ragbits-evaluate 0.5.0__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. ragbits/evaluate/agent_simulation/__init__.py +87 -0
  2. ragbits/evaluate/agent_simulation/context.py +118 -0
  3. ragbits/evaluate/agent_simulation/conversation.py +333 -0
  4. ragbits/evaluate/agent_simulation/deepeval_evaluator.py +92 -0
  5. ragbits/evaluate/agent_simulation/logger.py +165 -0
  6. ragbits/evaluate/agent_simulation/metrics/__init__.py +19 -0
  7. ragbits/evaluate/agent_simulation/metrics/builtin.py +221 -0
  8. ragbits/evaluate/agent_simulation/metrics/collectors.py +142 -0
  9. ragbits/evaluate/agent_simulation/models.py +37 -0
  10. ragbits/evaluate/agent_simulation/results.py +200 -0
  11. ragbits/evaluate/agent_simulation/scenarios.py +129 -0
  12. ragbits/evaluate/agent_simulation/simulation.py +243 -0
  13. ragbits/evaluate/cli.py +150 -0
  14. ragbits/evaluate/config.py +11 -0
  15. ragbits/evaluate/dataloaders/__init__.py +3 -0
  16. ragbits/evaluate/dataloaders/base.py +95 -0
  17. ragbits/evaluate/dataloaders/document_search.py +61 -0
  18. ragbits/evaluate/dataloaders/exceptions.py +25 -0
  19. ragbits/evaluate/dataloaders/gaia.py +78 -0
  20. ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
  21. ragbits/evaluate/dataloaders/human_eval.py +70 -0
  22. ragbits/evaluate/dataloaders/question_answer.py +56 -0
  23. ragbits/evaluate/dataset_generator/pipeline.py +4 -4
  24. ragbits/evaluate/dataset_generator/prompts/qa.py +2 -4
  25. ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +2 -4
  26. ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +3 -5
  27. ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +3 -3
  28. ragbits/evaluate/evaluator.py +178 -50
  29. ragbits/evaluate/factories/__init__.py +42 -0
  30. ragbits/evaluate/metrics/__init__.py +2 -23
  31. ragbits/evaluate/metrics/base.py +40 -17
  32. ragbits/evaluate/metrics/document_search.py +40 -23
  33. ragbits/evaluate/metrics/gaia.py +84 -0
  34. ragbits/evaluate/metrics/hotpot_qa.py +51 -0
  35. ragbits/evaluate/metrics/human_eval.py +105 -0
  36. ragbits/evaluate/metrics/question_answer.py +222 -0
  37. ragbits/evaluate/optimizer.py +138 -86
  38. ragbits/evaluate/pipelines/__init__.py +37 -0
  39. ragbits/evaluate/pipelines/base.py +34 -10
  40. ragbits/evaluate/pipelines/document_search.py +72 -67
  41. ragbits/evaluate/pipelines/gaia.py +249 -0
  42. ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
  43. ragbits/evaluate/pipelines/human_eval.py +323 -0
  44. ragbits/evaluate/pipelines/question_answer.py +96 -0
  45. ragbits/evaluate/utils.py +86 -59
  46. {ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +33 -9
  47. ragbits_evaluate-1.4.0.dev202602030301.dist-info/RECORD +59 -0
  48. {ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +1 -1
  49. ragbits/evaluate/callbacks/base.py +0 -22
  50. ragbits/evaluate/callbacks/neptune.py +0 -26
  51. ragbits/evaluate/loaders/__init__.py +0 -21
  52. ragbits/evaluate/loaders/base.py +0 -24
  53. ragbits/evaluate/loaders/hf.py +0 -25
  54. ragbits_evaluate-0.5.0.dist-info/RECORD +0 -33
  55. /ragbits/evaluate/{callbacks/__init__.py → py.typed} +0 -0
ragbits/evaluate/metrics/gaia.py
@@ -0,0 +1,84 @@
+ from statistics import mean
+
+ from ragbits.evaluate.metrics.base import Metric
+ from ragbits.evaluate.pipelines.gaia import GaiaResult
+
+
+ class GaiaOutcome(Metric[GaiaResult]):
+     """
+     Computes task success rate over GAIA tasks.
+     Measures the fraction of tasks that were successfully solved.
+     """
+
+     @staticmethod
+     async def compute(results: list[GaiaResult]) -> dict:
+         """Compute task success rate.
+
+         Returns:
+             Dictionary with gaia_task_success_rate: fraction of successfully solved tasks.
+         """
+         success_count = sum(1 for r in results if r.task_success)
+         success_rate = (success_count / len(results)) if results else 0.0
+
+         return {"gaia_task_success_rate": float(success_rate)}
+
+
+ class GaiaTooling(Metric[GaiaResult]):
+     """
+     Tool utilization and performance metrics:
+     - gaia_tool_trigger_rate: fraction of tasks where tools were used
+     - gaia_avg_num_tool_calls: average number of tool calls per task
+     - gaia_avg_tool_error_count: average number of tool errors per task
+     - gaia_tool_frequency_usage.<tool>: average number of calls per task for each tool
+     """
+
+     @staticmethod
+     async def compute(results: list[GaiaResult]) -> dict:
+         """Compute tool utilization and performance metrics.
+
+         Returns:
+             Dictionary with tool trigger rate, average tool calls, average errors,
+             and flattened per-tool frequency usage as numeric metrics.
+         """
+         tool_triggered_count = sum(1 for r in results if r.tool_triggered)
+         tool_trigger_rate = (tool_triggered_count / len(results)) if results else 0.0
+         avg_tool_calls = float(mean(r.num_tool_calls for r in results)) if results else 0.0
+         avg_tool_errors = float(mean(r.tool_error_count for r in results)) if results else 0.0
+
+         # Tool frequency as average per task (mean calls per task per tool).
+         total_tasks = len(results) if results else 1
+         aggregated_counts: dict[str, int] = {}
+         for r in results:
+             if r.tool_names:
+                 for name in r.tool_names:
+                     aggregated_counts[name] = aggregated_counts.get(name, 0) + 1
+         averaged_freq: dict[str, float] = {
+             f"gaia_tool_frequency_usage.{name}": (count / total_tasks) for name, count in aggregated_counts.items()
+         }
+
+         return {
+             "gaia_tool_trigger_rate": float(tool_trigger_rate),
+             "gaia_avg_num_tool_calls": avg_tool_calls,
+             "gaia_avg_tool_error_count": avg_tool_errors,
+             **averaged_freq,
+         }
+
+
+ class GaiaEfficiency(Metric[GaiaResult]):
+     """
+     Efficiency and resource usage metrics:
+     - gaia_avg_latency_ms: average response latency in milliseconds
+     """
+
+     @staticmethod
+     async def compute(results: list[GaiaResult]) -> dict:
+         """Compute efficiency and resource usage metrics.
+
+         Returns:
+             Dictionary with average latency.
+         """
+         avg_latency = float(mean(r.total_latency_ms for r in results)) if results else 0.0
+
+         return {
+             "gaia_avg_latency_ms": avg_latency,
+         }
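
For orientation, a minimal usage sketch of the new GAIA metrics. FakeGaiaResult is a hypothetical stand-in that carries only the attributes these metrics read; real results are produced by the GAIA pipeline (ragbits/evaluate/pipelines/gaia.py), whose full schema is not shown in this hunk.

import asyncio
from dataclasses import dataclass, field

from ragbits.evaluate.metrics.gaia import GaiaOutcome, GaiaTooling


@dataclass
class FakeGaiaResult:  # hypothetical stand-in for GaiaResult
    task_success: bool
    tool_triggered: bool
    num_tool_calls: int
    tool_error_count: int
    total_latency_ms: float
    tool_names: list[str] = field(default_factory=list)


async def main() -> None:
    results = [
        FakeGaiaResult(True, True, 2, 0, 850.0, ["web_search", "calculator"]),
        FakeGaiaResult(False, False, 0, 0, 120.0),
    ]
    # Each metric returns a flat dict of floats, so reports can be merged with dict unpacking.
    print(await GaiaOutcome.compute(results))  # {'gaia_task_success_rate': 0.5}
    print(await GaiaTooling.compute(results))  # trigger rate 0.5, avg calls 1.0, per-tool frequency 0.5


asyncio.run(main())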
ragbits/evaluate/metrics/hotpot_qa.py
@@ -0,0 +1,51 @@
+ from collections import defaultdict
+ from collections.abc import Iterable
+
+ from ragbits.evaluate.metrics.base import Metric
+ from ragbits.evaluate.pipelines.hotpot_qa import HotpotQAResult
+
+
+ class HotpotQAExactMatch(Metric[HotpotQAResult]):
+     """Computes EM over HotpotQA by type and overall."""
+
+     @staticmethod
+     async def compute(results: list[HotpotQAResult]) -> dict:
+         """Compute EM. Returns hotpotqa_<type>_em and hotpotqa_overall_em."""
+         buckets: dict[str, list[float]] = defaultdict(list)
+         for r in results:
+             em = r.em_value
+             t = r.qtype or "unknown"
+             buckets[t].append(em)
+             buckets["overall"].append(em)
+
+         def avg(vals: Iterable[float]) -> float:
+             lst = list(vals)
+             return float(sum(lst) / len(lst)) if lst else 0.0
+
+         metrics: dict[str, float] = {}
+         for t, vals in buckets.items():
+             metrics[f"hotpotqa_{t}_em"] = avg(vals)
+         return metrics
+
+
+ class HotpotQAF1(Metric[HotpotQAResult]):
+     """Computes token-level F1 over HotpotQA by type and overall."""
+
+     @staticmethod
+     async def compute(results: list[HotpotQAResult]) -> dict:
+         """Compute F1. Returns hotpotqa_<type>_f1 and hotpotqa_overall_f1."""
+         buckets: dict[str, list[float]] = defaultdict(list)
+         for r in results:
+             f1v = r.f1_value
+             t = r.qtype or "unknown"
+             buckets[t].append(f1v)
+             buckets["overall"].append(f1v)
+
+         def avg(vals: Iterable[float]) -> float:
+             lst = list(vals)
+             return float(sum(lst) / len(lst)) if lst else 0.0
+
+         metrics: dict[str, float] = {}
+         for t, vals in buckets.items():
+             metrics[f"hotpotqa_{t}_f1"] = avg(vals)
+         return metrics
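
The bucketing above yields one key per HotpotQA question type (bridge, comparison) plus an overall average; results without a type fall into an "unknown" bucket. A minimal sketch with hypothetical stand-in results (only the attributes the metrics read are modeled):

import asyncio
from dataclasses import dataclass

from ragbits.evaluate.metrics.hotpot_qa import HotpotQAExactMatch, HotpotQAF1


@dataclass
class FakeHotpotQAResult:  # hypothetical stand-in for HotpotQAResult
    em_value: float
    f1_value: float
    qtype: str | None


async def main() -> None:
    results = [
        FakeHotpotQAResult(1.0, 1.0, "bridge"),
        FakeHotpotQAResult(0.0, 0.4, "comparison"),
        FakeHotpotQAResult(0.0, 0.5, None),  # no type -> "unknown" bucket
    ]
    print(await HotpotQAExactMatch.compute(results))  # bridge 1.0, comparison 0.0, unknown 0.0, overall ~0.33
    print(await HotpotQAF1.compute(results))  # bridge 1.0, comparison 0.4, unknown 0.5, overall ~0.63


asyncio.run(main())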
ragbits/evaluate/metrics/human_eval.py
@@ -0,0 +1,105 @@
+ import math
+ from statistics import mean
+
+ from ragbits.evaluate.metrics.base import Metric
+ from ragbits.evaluate.pipelines.human_eval import HumanEvalResult
+
+
+ class HumanEvalPassAtK(Metric[HumanEvalResult]):
+     """
+     Computes pass@k over HumanEval tasks.
+     Measures the fraction of tasks with at least one passing sample out of k attempts.
+     """
+
+     def __init__(self, k: int = 1) -> None:
+         super().__init__()
+         self.k = k
+
+     async def compute(self, results: list[HumanEvalResult]) -> dict:
+         """Compute pass@k averaged over tasks.
+
+         Returns:
+             Dictionary with humaneval_pass@k: fraction of tasks with at least one passing sample.
+         """
+         values = []
+         for r in results:
+             n = len(r.passed_mask)
+             m = sum(1 for x in r.passed_mask if x)
+             k = min(self.k, n)
+             if n == 0 or k == 0:
+                 values.append(0.0)
+                 continue
+             if m == 0:
+                 values.append(0.0)
+                 continue
+             if m == n:
+                 values.append(1.0)
+                 continue
+             # Unbiased estimator: pass@k = 1 - C(n - m, k) / C(n, k)
+             denom = math.comb(n, k)
+             numer = math.comb(n - m, k) if n - m >= k else 0
+             values.append(1.0 - (numer / denom))
+         return {f"humaneval_pass@{self.k}": float(mean(values)) if values else 0.0}
+
+
+ class HumanEvalQualityPerf(Metric[HumanEvalResult]):
+     """
+     Code quality and execution performance metrics:
+     - humaneval_compile_rate: fraction of samples that compiled
+     - humaneval_syntax_error_rate: fraction of samples with syntax error (compile failed)
+     - humaneval_assert_fail_rate: fraction of samples that ran but failed assertions
+     - humaneval_runtime_error_rate: fraction of samples with other runtime errors
+     - humaneval_timeout_rate: fraction of samples that timed out
+     - humaneval_tasks_solved: fraction of tasks with any passing sample
+     - humaneval_avg_exec_time_sec: average exec time over compilable runs
+     """
+
+     @staticmethod
+     async def compute(results: list[HumanEvalResult]) -> dict:
+         """Compute code quality and execution performance metrics.
+
+         Returns:
+             Dictionary with compile rates, error rates, tasks solved rate, and average execution time.
+         """
+         total_samples = sum(len(r.passed_mask) for r in results)
+         compiled = 0
+         syntax_errors = 0
+         assert_fails = 0
+         runtime_errors = 0
+         timeouts = 0
+         any_pass = sum(1 for r in results if any(r.passed_mask))
+         durations: list[float] = []
+
+         for r in results:
+             for ok, err, dur in zip(r.compile_ok_mask, r.errors, r.exec_durations_sec, strict=False):
+                 if ok:
+                     compiled += 1
+                     durations.append(dur)
+                     if err:
+                         if err.startswith("AssertionError"):
+                             assert_fails += 1
+                         elif err.startswith("TimeoutError"):
+                             timeouts += 1
+                         else:
+                             runtime_errors += 1
+                 else:
+                     # Compile failed: count as syntax error.
+                     syntax_errors += 1
+
+         compile_rate = (compiled / total_samples) if total_samples else 0.0
+         syntax_error_rate = (syntax_errors / total_samples) if total_samples else 0.0
+         assert_fail_rate = (assert_fails / total_samples) if total_samples else 0.0
+         runtime_error_rate = (runtime_errors / total_samples) if total_samples else 0.0
+         timeout_rate = (timeouts / total_samples) if total_samples else 0.0
+         tasks_solved = (any_pass / len(results)) if results else 0.0
+         avg_exec_time = float(mean(durations)) if durations else 0.0
+
+         return {
+             "humaneval_compile_rate": float(compile_rate),
+             "humaneval_syntax_error_rate": float(syntax_error_rate),
+             "humaneval_assert_fail_rate": float(assert_fail_rate),
+             "humaneval_runtime_error_rate": float(runtime_error_rate),
+             "humaneval_timeout_rate": float(timeout_rate),
+             "humaneval_tasks_solved": float(tasks_solved),
+             "humaneval_avg_exec_time_sec": avg_exec_time,
+         }
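
The pass@k code uses the standard unbiased estimator 1 - C(n - m, k) / C(n, k), where n is the number of generated samples for a task and m the number that pass. For example, with n = 5 samples of which m = 2 pass, pass@1 = 1 - C(3, 1)/C(5, 1) = 1 - 3/5 = 0.4, i.e. the probability that one uniformly drawn sample passes. A minimal sketch with a hypothetical stand-in result (only passed_mask is modeled; real results come from ragbits/evaluate/pipelines/human_eval.py):

import asyncio
from dataclasses import dataclass

from ragbits.evaluate.metrics.human_eval import HumanEvalPassAtK


@dataclass
class FakeHumanEvalResult:  # hypothetical stand-in for HumanEvalResult
    passed_mask: list[bool]


async def main() -> None:
    # One task, 5 generated samples, 2 of which pass the unit tests.
    results = [FakeHumanEvalResult([True, False, True, False, False])]
    print(await HumanEvalPassAtK(k=1).compute(results))  # {'humaneval_pass@1': 0.4}
    print(await HumanEvalPassAtK(k=3).compute(results))  # 1 - C(3,3)/C(5,3) = 0.9


asyncio.run(main())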
ragbits/evaluate/metrics/question_answer.py
@@ -0,0 +1,222 @@
+ import asyncio
+ from abc import ABC, abstractmethod
+ from asyncio import AbstractEventLoop
+ from itertools import chain
+ from typing import Generic, TypeVar
+
+ from typing_extensions import Self
+
+ from ragbits.agents.types import QuestionAnswerPromptOutputT
+ from ragbits.core.llms.base import LLM
+ from ragbits.core.utils.helpers import batched
+ from ragbits.evaluate.metrics.base import Metric
+ from ragbits.evaluate.pipelines.question_answer import QuestionAnswerResult
+
+ try:
+     from continuous_eval.llm_factory import LLMInterface
+     from continuous_eval.metrics.base import LLMBasedMetric
+     from continuous_eval.metrics.generation.text import (
+         LLMBasedAnswerCorrectness,
+         LLMBasedAnswerRelevance,
+         LLMBasedFaithfulness,
+         LLMBasedStyleConsistency,
+     )
+ except ModuleNotFoundError:
+     from continuous_eval.llms.base import LLMInterface
+     from continuous_eval.metrics import Metric as LLMBasedMetric
+     from continuous_eval.metrics.generation.text import (
+         AnswerCorrectness as LLMBasedAnswerCorrectness,
+     )
+     from continuous_eval.metrics.generation.text import (
+         AnswerRelevance as LLMBasedAnswerRelevance,
+     )
+     from continuous_eval.metrics.generation.text import (
+         Faithfulness as LLMBasedFaithfulness,
+     )
+     from continuous_eval.metrics.generation.text import (
+         StyleConsistency as LLMBasedStyleConsistency,
+     )
+
+ MetricT = TypeVar("MetricT", bound=LLMBasedMetric)
+
+
+ class _MetricLMM(LLMInterface):
+     """
+     Implementation of the interface required by Relari generative metrics, based on LiteLLM.
+     """
+
+     def __init__(self, llm: LLM, loop: AbstractEventLoop) -> None:
+         self._llm = llm
+         self._loop = loop
+
+     def run(self, prompt: dict[str, str], temperature: float = 0, max_tokens: int = 1024) -> str:
+         formatted_prompt = [
+             {"role": "system", "content": prompt["system_prompt"]},
+             {"role": "user", "content": prompt["user_prompt"]},
+         ]
+         options = self._llm.options_cls(
+             temperature=temperature,
+             max_tokens=max_tokens,
+         )
+         return asyncio.run_coroutine_threadsafe(
+             self._llm.generate(formatted_prompt, options=options),
+             self._loop,
+         ).result()
+
+
+ class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
+     """
+     Metric for question answer evaluation based on the Relari backend.
+     More details can be found [here](https://docs.relari.ai/category/text-generation).
+     """
+
+     metric_cls: type[MetricT]
+
+     def __init__(self, llm: LLM, batch_size: int = 15, weight: float = 1.0) -> None:
+         """
+         Initialize the question answer metric.
+
+         Args:
+             llm: Judge LLM instance.
+             batch_size: Batch size for metric computation.
+             weight: Metric value weight in the final score, used during optimization.
+         """
+         super().__init__(weight=weight)
+         self.llm = llm
+         self.batch_size = batch_size
+
+     @classmethod
+     def from_config(cls, config: dict) -> Self:
+         """
+         Create an instance of `QuestionAnswerMetric` from a configuration dictionary.
+
+         Args:
+             config: A dictionary containing configuration settings for the metric.
+
+         Returns:
+             An instance of the metric class initialized with the provided configuration.
+         """
+         config["llm"] = LLM.from_config(config["llm"])
+         config["batch_size"] = config.get("batch_size", 15)
+         config["weight"] = config.get("weight", 1.0)
+         return super().from_config(config)
+
+     async def compute(self, results: list[QuestionAnswerResult[QuestionAnswerPromptOutputT]]) -> dict:
+         """
+         Compute the metric.
+
+         Args:
+             results: The evaluation results.
+
+         Returns:
+             The computed metric.
+         """
+         metric = self.metric_cls(_MetricLMM(self.llm, loop=asyncio.get_running_loop()))
+         metric_results = chain.from_iterable(
+             [
+                 await asyncio.gather(*[asyncio.to_thread(self._call_metric, metric, result) for result in batch])
+                 for batch in batched(results, self.batch_size)
+             ]
+         )
+         return metric.aggregate(list(metric_results))
+
+     @staticmethod
+     @abstractmethod
+     def _call_metric(metric: MetricT, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+         """
+         Call the metric with the proper arguments.
+         """
+
+
+ class QuestionAnswerAnswerCorrectness(QuestionAnswerMetric[LLMBasedAnswerCorrectness]):
+     """
+     Metric checking answer correctness based on LLM.
+     More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_correctness).
+     """
+
+     metric_cls: type[LLMBasedAnswerCorrectness] = LLMBasedAnswerCorrectness
+
+     @staticmethod
+     def _call_metric(
+         metric: LLMBasedAnswerCorrectness,
+         result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+     ) -> dict:
+         return metric(
+             question=result.question,
+             answer=(
+                 result.predicted_result.content
+                 if isinstance(result.predicted_result.content, str)
+                 else result.predicted_result.content.answer
+             ),
+             ground_truth_answers=result.reference_answer,
+         )
+
+
+ class QuestionAnswerAnswerFaithfulness(QuestionAnswerMetric[LLMBasedFaithfulness]):
+     """
+     Metric checking answer faithfulness based on LLM.
+     More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_faithfulness).
+     """
+
+     metric_cls: type[LLMBasedFaithfulness] = LLMBasedFaithfulness
+
+     @staticmethod
+     def _call_metric(
+         metric: LLMBasedFaithfulness,
+         result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+     ) -> dict:
+         return metric(
+             question=result.question,
+             answer=(
+                 result.predicted_result.content
+                 if isinstance(result.predicted_result.content, str)
+                 else result.predicted_result.content.answer
+             ),
+             retrieved_context=result.reference_context,
+         )
+
+
+ class QuestionAnswerAnswerRelevance(QuestionAnswerMetric[LLMBasedAnswerRelevance]):
+     """
+     Metric checking answer relevance based on LLM.
+     More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_relevance).
+     """
+
+     metric_cls: type[LLMBasedAnswerRelevance] = LLMBasedAnswerRelevance
+
+     @staticmethod
+     def _call_metric(
+         metric: LLMBasedAnswerRelevance,
+         result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+     ) -> dict:
+         return metric(
+             question=result.question,
+             answer=(
+                 result.predicted_result.content
+                 if isinstance(result.predicted_result.content, str)
+                 else result.predicted_result.content.answer
+             ),
+         )
+
+
+ class QuestionAnswerAnswerConsistency(QuestionAnswerMetric[LLMBasedStyleConsistency]):
+     """
+     Metric checking answer style consistency based on LLM.
+     More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_style).
+     """
+
+     metric_cls: type[LLMBasedStyleConsistency] = LLMBasedStyleConsistency
+
+     @staticmethod
+     def _call_metric(
+         metric: LLMBasedStyleConsistency,
+         result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+     ) -> dict:
+         return metric(
+             answer=(
+                 result.predicted_result.content
+                 if isinstance(result.predicted_result.content, str)
+                 else result.predicted_result.content.answer
+             ),
+             ground_truth_answers=result.reference_answer,
+         )
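
These metrics wrap synchronous continuous-eval (Relari) metrics: compute() runs them in worker threads in batches of batch_size via asyncio.to_thread, while _MetricLMM schedules the judge-LLM calls back onto the running event loop with run_coroutine_threadsafe. A hedged sketch of the intended call pattern; judge_llm (a ragbits LLM instance) and results (QuestionAnswerResult objects from the question-answer pipeline) are assumed to be constructed elsewhere:

from ragbits.evaluate.metrics.question_answer import QuestionAnswerAnswerCorrectness


async def evaluate_correctness(judge_llm, results) -> dict:
    # Instantiate with the judge LLM; batch_size controls how many samples are
    # scored concurrently in worker threads.
    metric = QuestionAnswerAnswerCorrectness(llm=judge_llm, batch_size=8)
    # compute() must be awaited inside a running event loop, since _MetricLMM
    # submits the judge-LLM coroutines back onto that loop from the threads.
    return await metric.compute(results)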