ragbits-evaluate 1.4.0.dev202509220622__tar.gz → 1.4.0.dev202511160236__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/CHANGELOG.md +2 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/PKG-INFO +2 -2
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/pyproject.toml +2 -2
- ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/gaia.py +78 -0
- ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
- ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/human_eval.py +70 -0
- ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/gaia.py +84 -0
- ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/hotpot_qa.py +51 -0
- ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/human_eval.py +105 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/pipelines/__init__.py +12 -1
- ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/pipelines/gaia.py +249 -0
- ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
- ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/pipelines/human_eval.py +323 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/.gitignore +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/README.md +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/__init__.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/cli.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/config.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataloaders/base.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataloaders/document_search.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataloaders/question_answer.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/evaluator.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/factories/__init__.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/metrics/__init__.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/metrics/base.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/metrics/document_search.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/metrics/question_answer.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/optimizer.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/pipelines/base.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/pipelines/document_search.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/pipelines/question_answer.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/py.typed +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/utils.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/tests/cli/test_run_evaluation.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/tests/unit/test_evaluator.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/tests/unit/test_metrics.py +0 -0
- {ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/tests/unit/test_optimizer.py +0 -0
{ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/PKG-INFO RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 1.4.0.dev202509220622
+Version: 1.4.0.dev202511160236
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.5.0
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==1.4.0.dev202509220622
+Requires-Dist: ragbits-core==1.4.0.dev202511160236
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
{ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/pyproject.toml RENAMED

@@ -1,6 +1,6 @@
 [project]
 name = "ragbits-evaluate"
-version = "1.4.0.dev202509220622"
+version = "1.4.0.dev202511160236"
 description = "Evaluation module for Ragbits components"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.4.0.dev202509220622"]
+dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.4.0.dev202511160236"]

 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/gaia.py ADDED

@@ -0,0 +1,78 @@
from collections.abc import Iterable

from ragbits.core.sources.base import Source
from ragbits.evaluate.dataloaders.base import DataLoader
from ragbits.evaluate.pipelines.gaia import GaiaData


class GaiaDataLoader(DataLoader[GaiaData]):
    """
    GAIA benchmark evaluation data loader.

    The source should point to a local/remote JSON or JSONL file exported from the
    Hugging Face dataset `gaia-benchmark/GAIA`. Rows are expected to contain at least:
    - "task_id" (str)
    - "Question" (str)
    - "Level" (int)
    - "Final answer" (str)
    """

    def __init__(
        self,
        source: Source,
        *,
        split: str = "data",
        task_id_key: str = "task_id",
        question_key: str = "Question",
        level_key: str = "Level",
        final_answer_key: str = "Final answer",
        file_name_key: str = "file_name",
        skip_file_attachments: bool = False,
    ) -> None:
        """
        Initialize the GAIA data loader.

        Args:
            source: The source to load the data from.
            split: The split to load the data from (file name generated by the source helper).
            task_id_key: Column name for GAIA task identifier.
            question_key: Column name for the natural language question.
            level_key: Column name for numeric difficulty level (1, 2, 3).
            final_answer_key: Column name for the final ground-truth answer.
            file_name_key: Column name with optional associated file name (may be empty).
            skip_file_attachments: If True, skip rows that have a non-empty file attachment.
        """
        required = {task_id_key, question_key, level_key, final_answer_key}
        super().__init__(source=source, split=split, required_keys=required)
        self.task_id_key = task_id_key
        self.question_key = question_key
        self.level_key = level_key
        self.final_answer_key = final_answer_key
        self.file_name_key = file_name_key
        self.skip_file_attachments = skip_file_attachments

    async def map(self, dataset: Iterable[dict]) -> Iterable[GaiaData]:
        """
        Map the dataset to the GAIA evaluation data schema.

        Args:
            dataset: The dataset to map.

        Returns:
            The GAIA evaluation data rows.
        """
        return [
            GaiaData(
                task_id=str(row.get(self.task_id_key, "")),
                question=str(row.get(self.question_key, "")),
                level=int(row.get(self.level_key, 1)),
                reference_answer=str(row.get(self.final_answer_key, "")),
                file_name=(row.get(self.file_name_key) or None),
            )
            for row in dataset
            if (
                not self.skip_file_attachments
                or not row.get(self.file_name_key)
                or str(row.get(self.file_name_key)).strip() == ""
            )
        ]
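The list-comprehension filter at the end of GaiaDataLoader.map keeps a row unless skip_file_attachments is set and the row carries a non-empty file name. Below is a minimal standalone sketch of that rule using plain dicts; keep_row is a hypothetical helper written only for illustration and is not part of the package.

def keep_row(row: dict, skip_file_attachments: bool, file_name_key: str = "file_name") -> bool:
    # Mirrors the condition in GaiaDataLoader.map: keep everything when attachments
    # are not skipped, otherwise keep only rows whose file name is missing or blank.
    file_name = row.get(file_name_key)
    return not skip_file_attachments or not file_name or str(file_name).strip() == ""

rows = [
    {"task_id": "1", "Question": "Q1", "Level": 1, "Final answer": "A1", "file_name": ""},
    {"task_id": "2", "Question": "Q2", "Level": 2, "Final answer": "A2", "file_name": "table.xlsx"},
]
print([r["task_id"] for r in rows if keep_row(r, skip_file_attachments=True)])  # ['1']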
ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/hotpot_qa.py ADDED

@@ -0,0 +1,95 @@
from collections.abc import Iterable
from typing import Any

from ragbits.core.sources.base import Source
from ragbits.evaluate.dataloaders.base import DataLoader
from ragbits.evaluate.pipelines.hotpot_qa import HotpotQAData


class HotpotQADataLoader(DataLoader[HotpotQAData]):
    """
    HotpotQA evaluation data loader.

    The source should point to a local/remote JSON file exported from Hugging Face, where each example includes at
    least the following keys:
    - "id" (str)
    - "question" (str)
    - "answer" (str)
    - "type" ("bridge" | "comparison")
    - "level" ("easy" | "medium" | "hard")
    - "context" (object with keys: "title": list[str], "sentences": list[list[str]])
    """

    def __init__(
        self,
        source: Source,
        *,
        split: str = "data",
        id_key: str = "id",
        question_key: str = "question",
        answer_key: str = "answer",
        type_key: str = "type",
        level_key: str = "level",
        context_key: str = "context",
        # filter
        level_filter: str | None = None,  # one of: easy|medium|hard
    ) -> None:
        """
        Initialize the HotpotQA data loader.

        Args:
            source: The source to load the data from.
            split: The split to load the data from.
            id_key: Column with unique id.
            question_key: Column with question text.
            answer_key: Column with ground truth answer.
            type_key: Column with question type ("bridge" | "comparison").
            level_key: Column with difficulty ("easy" | "medium" | "hard").
            context_key: Column with context object containing titles and sentences.
            level_filter: If provided, return only examples with this level.
        """
        required = {id_key, question_key, answer_key, type_key, level_key, context_key}
        super().__init__(source=source, split=split, required_keys=required)
        self.id_key = id_key
        self.question_key = question_key
        self.answer_key = answer_key
        self.type_key = type_key
        self.level_key = level_key
        self.context_key = context_key
        self.level_filter = level_filter

    async def map(self, dataset: Iterable[dict]) -> Iterable[HotpotQAData]:
        """
        Map the dataset to the HotpotQA evaluation data schema.

        Args:
            dataset: The dataset to map.

        Returns:
            The HotpotQA evaluation data rows.
        """

        def to_context_rows(context: dict[str, Any]) -> list[str]:
            titles = context.get("title", []) or []
            sentences = context.get("sentences", []) or []
            rows: list[str] = []
            for title, sent_list in zip(titles, sentences, strict=False):
                doc_text = "\n".join(sent_list) if isinstance(sent_list, list) else str(sent_list)
                rows.append(f"{title}\n{doc_text}")
            if not rows and isinstance(sentences, list):
                flat = "\n".join([" ".join(s) if isinstance(s, list) else str(s) for s in sentences])
                rows = [flat]
            return rows

        return [
            HotpotQAData(
                id=row.get(self.id_key, ""),
                question=row.get(self.question_key, ""),
                reference_answer=str(row.get(self.answer_key, "")),
                qtype=str(row.get(self.type_key, "")),
                level=(row.get(self.level_key) or "").lower(),
                reference_context=to_context_rows(row.get(self.context_key, {}) or {}),
            )
            for row in dataset
            if not self.level_filter or (row.get(self.level_key, "").lower() == self.level_filter)
        ]
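The nested to_context_rows helper flattens HotpotQA's parallel title/sentences arrays into one string per supporting document. The sketch below copies that helper so it can be run standalone; the sample context values are illustrative, not taken from the dataset.

from typing import Any

def to_context_rows(context: dict[str, Any]) -> list[str]:
    # Standalone copy of the nested helper in HotpotQADataLoader.map, reproduced
    # here only to show the expected shape of the output.
    titles = context.get("title", []) or []
    sentences = context.get("sentences", []) or []
    rows: list[str] = []
    for title, sent_list in zip(titles, sentences, strict=False):
        doc_text = "\n".join(sent_list) if isinstance(sent_list, list) else str(sent_list)
        rows.append(f"{title}\n{doc_text}")
    if not rows and isinstance(sentences, list):
        rows = ["\n".join(" ".join(s) if isinstance(s, list) else str(s) for s in sentences)]
    return rows

context = {
    "title": ["Scott Derrickson", "Ed Wood"],
    "sentences": [["Scott Derrickson is an American director."], ["Ed Wood was an American filmmaker."]],
}
print(to_context_rows(context))
# ['Scott Derrickson\nScott Derrickson is an American director.',
#  'Ed Wood\nEd Wood was an American filmmaker.']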
ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/human_eval.py ADDED

@@ -0,0 +1,70 @@
from collections.abc import Iterable

from ragbits.core.sources.base import Source
from ragbits.evaluate.dataloaders.base import DataLoader
from ragbits.evaluate.pipelines.human_eval import HumanEvalData


class HumanEvalDataLoader(DataLoader[HumanEvalData]):
    """
    HumanEval evaluation data loader.

    The source should point to a local/remote JSONL file in HumanEval format, where each line is a JSON object
    with at least the following keys:
    - "task_id" (str)
    - "prompt" (str)
    - "entry_point" (str)
    - "test" (str)
    """

    def __init__(
        self,
        source: Source,
        *,
        split: str = "data",
        task_id_key: str = "task_id",
        prompt_key: str = "prompt",
        entry_point_key: str = "entry_point",
        test_key: str = "test",
        canonical_solution_key: str | None = "canonical_solution",
    ) -> None:
        """
        Initialize the HumanEval data loader.

        Args:
            source: The source to load the data from.
            split: The split to load the data from.
            task_id_key: Dataset column with the HumanEval task identifier.
            prompt_key: Dataset column with the Python prompt (function signature and docstring).
            entry_point_key: Dataset column with the function name to evaluate.
            test_key: Dataset column with the Python test harness defining `check(candidate)`.
            canonical_solution_key: Optional dataset column with the reference solution (not used for scoring).
        """
        required = {task_id_key, prompt_key, entry_point_key, test_key}
        super().__init__(source=source, split=split, required_keys=required)
        self.task_id_key = task_id_key
        self.prompt_key = prompt_key
        self.entry_point_key = entry_point_key
        self.test_key = test_key
        self.canonical_solution_key = canonical_solution_key

    async def map(self, dataset: Iterable[dict]) -> Iterable[HumanEvalData]:
        """
        Map the dataset to the HumanEval evaluation data schema.

        Args:
            dataset: The dataset to map.

        Returns:
            The HumanEval evaluation data rows.
        """
        return [
            HumanEvalData(
                task_id=row.get(self.task_id_key, ""),
                prompt=row.get(self.prompt_key, ""),
                entry_point=row.get(self.entry_point_key, ""),
                test=row.get(self.test_key, ""),
                canonical_solution=(row.get(self.canonical_solution_key) if self.canonical_solution_key else None),
            )
            for row in dataset
        ]
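For orientation, a single JSONL record satisfying the loader's required keys could look like the following; every field value here is an illustrative placeholder, not real HumanEval content.

import json

record = {
    "task_id": "HumanEval/0",  # illustrative only; not the real HumanEval/0 task
    "prompt": 'def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n',
    "entry_point": "add",
    "test": "def check(candidate):\n    assert candidate(1, 2) == 3\n",
    "canonical_solution": "    return a + b\n",
}
print(json.dumps(record))  # one such JSON object per line in the JSONL file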
ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/gaia.py ADDED

@@ -0,0 +1,84 @@
from statistics import mean

from ragbits.evaluate.metrics.base import Metric
from ragbits.evaluate.pipelines.gaia import GaiaResult


class GaiaOutcome(Metric[GaiaResult]):
    """
    Computes task success rate over GAIA tasks.
    Measures the fraction of tasks that were successfully solved.
    """

    @staticmethod
    async def compute(results: list[GaiaResult]) -> dict:
        """Compute task success rate.

        Returns:
            Dictionary with gaia_task_success_rate: fraction of successfully solved tasks.
        """
        success_count = sum(1 for r in results if r.task_success)
        success_rate = (success_count / len(results)) if results else 0.0

        return {"gaia_task_success_rate": float(success_rate)}


class GaiaTooling(Metric[GaiaResult]):
    """
    Tool utilization and performance metrics:
    - gaia_tool_trigger_rate: fraction of tasks where tools were used
    - gaia_avg_num_tool_calls: average number of tool calls per task
    - gaia_avg_tool_error_count: average number of tool errors per task
    - gaia_tool_frequency_usage.<tool>: average number of uses per task for each tool
    """

    @staticmethod
    async def compute(results: list[GaiaResult]) -> dict:
        """Compute tool utilization and performance metrics.

        Returns:
            Dictionary with tool trigger rate, average tool calls, average errors,
            and flattened tool frequency usage as numeric metrics.
        """
        tool_triggered_count = sum(1 for r in results if r.tool_triggered)
        tool_trigger_rate = (tool_triggered_count / len(results)) if results else 0.0
        avg_tool_calls = float(mean(r.num_tool_calls for r in results)) if results else 0.0
        avg_tool_errors = float(mean(r.tool_error_count for r in results)) if results else 0.0

        # tool frequency as average per task (mean calls per task per tool)
        total_tasks = len(results) if results else 1
        aggregated_counts: dict[str, int] = {}
        for r in results:
            if r.tool_names:
                for name in r.tool_names:
                    aggregated_counts[name] = aggregated_counts.get(name, 0) + 1
        averaged_freq: dict[str, float] = {
            f"gaia_tool_frequency_usage.{name}": (count / total_tasks) for name, count in aggregated_counts.items()
        }

        return {
            "gaia_tool_trigger_rate": float(tool_trigger_rate),
            "gaia_avg_num_tool_calls": avg_tool_calls,
            "gaia_avg_tool_error_count": avg_tool_errors,
            **averaged_freq,
        }


class GaiaEfficiency(Metric[GaiaResult]):
    """
    Efficiency and resource usage metrics:
    - gaia_avg_latency_ms: average response latency in milliseconds
    """

    @staticmethod
    async def compute(results: list[GaiaResult]) -> dict:
        """Compute efficiency and resource usage metrics.

        Returns:
            Dictionary with average latency.
        """
        avg_latency = float(mean(r.total_latency_ms for r in results)) if results else 0.0

        return {
            "gaia_avg_latency_ms": avg_latency,
        }
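A quick sanity check of the aggregation above: GaiaResult is defined in pipelines/gaia.py (not shown in this diff), so this sketch substitutes a stand-in dataclass carrying only the attributes the metrics read, and assumes the package version from this diff is installed.

import asyncio
from dataclasses import dataclass, field

from ragbits.evaluate.metrics.gaia import GaiaOutcome, GaiaTooling

@dataclass
class FakeGaiaResult:
    # Stand-in with just the fields GaiaOutcome/GaiaTooling read; not the real GaiaResult.
    task_success: bool
    tool_triggered: bool
    num_tool_calls: int
    tool_error_count: int
    tool_names: list[str] = field(default_factory=list)
    total_latency_ms: float = 0.0

results = [
    FakeGaiaResult(True, True, 2, 0, ["web_search", "calculator"], 1200.0),
    FakeGaiaResult(False, False, 0, 0, [], 800.0),
]
print(asyncio.run(GaiaOutcome.compute(results)))  # {'gaia_task_success_rate': 0.5}
print(asyncio.run(GaiaTooling.compute(results)))
# {'gaia_tool_trigger_rate': 0.5, 'gaia_avg_num_tool_calls': 1.0,
#  'gaia_avg_tool_error_count': 0.0, 'gaia_tool_frequency_usage.web_search': 0.5,
#  'gaia_tool_frequency_usage.calculator': 0.5}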
ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/hotpot_qa.py ADDED

@@ -0,0 +1,51 @@
from collections import defaultdict
from collections.abc import Iterable

from ragbits.evaluate.metrics.base import Metric
from ragbits.evaluate.pipelines.hotpot_qa import HotpotQAResult


class HotpotQAExactMatch(Metric[HotpotQAResult]):
    """Computes EM over HotpotQA by type and overall."""

    @staticmethod
    async def compute(results: list[HotpotQAResult]) -> dict:
        """Compute EM. Returns hotpotqa_<type>_em and hotpotqa_overall_em."""
        buckets: dict[str, list[float]] = defaultdict(list)
        for r in results:
            em = r.em_value
            t = r.qtype or "unknown"
            buckets[t].append(em)
            buckets["overall"].append(em)

        def avg(vals: Iterable[float]) -> float:
            lst = list(vals)
            return float(sum(lst) / len(lst)) if lst else 0.0

        metrics: dict[str, float] = {}
        for t, vals in buckets.items():
            metrics[f"hotpotqa_{t}_em"] = avg(vals)
        return metrics


class HotpotQAF1(Metric[HotpotQAResult]):
    """Computes token-level F1 over HotpotQA by type and overall."""

    @staticmethod
    async def compute(results: list[HotpotQAResult]) -> dict:
        """Compute F1. Returns hotpotqa_<type>_f1 and hotpotqa_overall_f1."""
        buckets: dict[str, list[float]] = defaultdict(list)
        for r in results:
            f1v = r.f1_value
            t = r.qtype or "unknown"
            buckets[t].append(f1v)
            buckets["overall"].append(f1v)

        def avg(vals: Iterable[float]) -> float:
            lst = list(vals)
            return float(sum(lst) / len(lst)) if lst else 0.0

        metrics: dict[str, float] = {}
        for t, vals in buckets.items():
            metrics[f"hotpotqa_{t}_f1"] = avg(vals)
        return metrics
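These two metrics only bucket per-result em_value and f1_value by question type; the values themselves are produced by the HotpotQA pipeline, which is not shown in this diff. A stand-in dataclass is enough to exercise the bucketing, again assuming this package version is installed.

import asyncio
from dataclasses import dataclass

from ragbits.evaluate.metrics.hotpot_qa import HotpotQAExactMatch

@dataclass
class FakeHotpotQAResult:
    # Stand-in with only the fields the metrics read; not the real HotpotQAResult.
    em_value: float
    f1_value: float
    qtype: str

results = [
    FakeHotpotQAResult(1.0, 1.0, "bridge"),
    FakeHotpotQAResult(0.0, 0.5, "comparison"),
]
print(asyncio.run(HotpotQAExactMatch.compute(results)))
# {'hotpotqa_bridge_em': 1.0, 'hotpotqa_overall_em': 0.5, 'hotpotqa_comparison_em': 0.0}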
ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/human_eval.py ADDED

@@ -0,0 +1,105 @@
import math
from statistics import mean

from ragbits.evaluate.metrics.base import Metric
from ragbits.evaluate.pipelines.human_eval import HumanEvalResult


class HumanEvalPassAtK(Metric[HumanEvalResult]):
    """
    Computes pass@k over HumanEval tasks.
    Measures the fraction of tasks with at least one passing sample out of k attempts.
    """

    def __init__(self, k: int = 1) -> None:
        super().__init__()
        self.k = k

    async def compute(self, results: list[HumanEvalResult]) -> dict:
        """Compute pass@k averaged over tasks.

        Returns:
            Dictionary with humaneval_pass@k: fraction of tasks with at least one passing sample.
        """
        values = []
        for r in results:
            n = len(r.passed_mask)
            m = sum(1 for x in r.passed_mask if x)
            k = min(self.k, n)
            if n == 0 or k == 0:
                values.append(0.0)
                continue
            if m == 0:
                values.append(0.0)
                continue
            if m == n:
                values.append(1.0)
                continue
            # 1 - C(n-m, k) / C(n, k)
            denom = math.comb(n, k)
            numer = math.comb(n - m, k) if n - m >= k else 0
            values.append(1.0 - (numer / denom))
        return {f"humaneval_pass@{self.k}": float(mean(values)) if values else 0.0}


class HumanEvalQualityPerf(Metric[HumanEvalResult]):
    """
    Code quality and execution performance metrics:
    - humaneval_compile_rate: fraction of samples that compiled
    - humaneval_syntax_error_rate: fraction of samples with syntax error (compile failed)
    - humaneval_assert_fail_rate: fraction of samples that ran but failed assertions
    - humaneval_runtime_error_rate: fraction of samples with other runtime errors
    - humaneval_timeout_rate: fraction of samples that timed out
    - humaneval_tasks_solved: fraction of tasks with any passing sample
    - humaneval_avg_exec_time_sec: average exec time over compilable runs
    """

    @staticmethod
    async def compute(results: list[HumanEvalResult]) -> dict:
        """Compute code quality and execution performance metrics.

        Returns:
            Dictionary with compile rates, error rates, tasks solved rate, and average execution time.
        """
        total_samples = sum(len(r.passed_mask) for r in results)
        compiled = 0
        syntax_errors = 0
        assert_fails = 0
        runtime_errors = 0
        timeouts = 0
        any_pass = sum(1 for r in results if any(r.passed_mask))
        durations: list[float] = []

        for r in results:
            for ok, err, dur in zip(r.compile_ok_mask, r.errors, r.exec_durations_sec, strict=False):
                if ok:
                    compiled += 1
                    durations.append(dur)
                    if err:
                        if err.startswith("AssertionError"):
                            assert_fails += 1
                        elif err.startswith("TimeoutError"):
                            timeouts += 1
                        else:
                            runtime_errors += 1
                else:
                    # Compile failed: count as syntax error
                    syntax_errors += 1

        compile_rate = (compiled / total_samples) if total_samples else 0.0
        syntax_error_rate = (syntax_errors / total_samples) if total_samples else 0.0
        assert_fail_rate = (assert_fails / total_samples) if total_samples else 0.0
        runtime_error_rate = (runtime_errors / total_samples) if total_samples else 0.0
        timeout_rate = (timeouts / total_samples) if total_samples else 0.0
        tasks_solved = (any_pass / len(results)) if results else 0.0
        avg_exec_time = float(mean(durations)) if durations else 0.0

        return {
            "humaneval_compile_rate": float(compile_rate),
            "humaneval_syntax_error_rate": float(syntax_error_rate),
            "humaneval_assert_fail_rate": float(assert_fail_rate),
            "humaneval_runtime_error_rate": float(runtime_error_rate),
            "humaneval_timeout_rate": float(timeout_rate),
            "humaneval_tasks_solved": float(tasks_solved),
            "humaneval_avg_exec_time_sec": avg_exec_time,
        }
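The combinatorial branch in HumanEvalPassAtK implements the standard unbiased estimator pass@k = 1 - C(n-m, k) / C(n, k), where n is the number of generated samples for a task and m the number that passed. A quick arithmetic check using only the standard library:

import math

n, m, k = 5, 2, 3  # 5 samples, 2 passed, evaluate pass@3
pass_at_k = 1.0 - math.comb(n - m, k) / math.comb(n, k)
print(pass_at_k)  # 0.9 -> drawing 3 of the 5 samples misses both passing ones only 10% of the time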
{ragbits_evaluate-1.4.0.dev202509220622 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/pipelines/__init__.py RENAMED

@@ -2,8 +2,19 @@ from ragbits.core.utils.config_handling import WithConstructionConfig
 from ragbits.document_search import DocumentSearch
 from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
 from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline
+from ragbits.evaluate.pipelines.gaia import GaiaPipeline
+from ragbits.evaluate.pipelines.hotpot_qa import HotpotQAPipeline
+from ragbits.evaluate.pipelines.human_eval import HumanEvalPipeline
 
-__all__ = [
+__all__ = [
+    "DocumentSearchPipeline",
+    "EvaluationData",
+    "EvaluationPipeline",
+    "EvaluationResult",
+    "GaiaPipeline",
+    "HotpotQAPipeline",
+    "HumanEvalPipeline",
+]
 
 _target_to_evaluation_pipeline: dict[type[WithConstructionConfig], type[EvaluationPipeline]] = {
     DocumentSearch: DocumentSearchPipeline,