ragbits-evaluate 0.5.0__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl
This diff compares publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
- ragbits/evaluate/agent_simulation/__init__.py +87 -0
- ragbits/evaluate/agent_simulation/context.py +118 -0
- ragbits/evaluate/agent_simulation/conversation.py +333 -0
- ragbits/evaluate/agent_simulation/deepeval_evaluator.py +92 -0
- ragbits/evaluate/agent_simulation/logger.py +165 -0
- ragbits/evaluate/agent_simulation/metrics/__init__.py +19 -0
- ragbits/evaluate/agent_simulation/metrics/builtin.py +221 -0
- ragbits/evaluate/agent_simulation/metrics/collectors.py +142 -0
- ragbits/evaluate/agent_simulation/models.py +37 -0
- ragbits/evaluate/agent_simulation/results.py +200 -0
- ragbits/evaluate/agent_simulation/scenarios.py +129 -0
- ragbits/evaluate/agent_simulation/simulation.py +243 -0
- ragbits/evaluate/cli.py +150 -0
- ragbits/evaluate/config.py +11 -0
- ragbits/evaluate/dataloaders/__init__.py +3 -0
- ragbits/evaluate/dataloaders/base.py +95 -0
- ragbits/evaluate/dataloaders/document_search.py +61 -0
- ragbits/evaluate/dataloaders/exceptions.py +25 -0
- ragbits/evaluate/dataloaders/gaia.py +78 -0
- ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
- ragbits/evaluate/dataloaders/human_eval.py +70 -0
- ragbits/evaluate/dataloaders/question_answer.py +56 -0
- ragbits/evaluate/dataset_generator/pipeline.py +4 -4
- ragbits/evaluate/dataset_generator/prompts/qa.py +2 -4
- ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +2 -4
- ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +3 -5
- ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +3 -3
- ragbits/evaluate/evaluator.py +178 -50
- ragbits/evaluate/factories/__init__.py +42 -0
- ragbits/evaluate/metrics/__init__.py +2 -23
- ragbits/evaluate/metrics/base.py +40 -17
- ragbits/evaluate/metrics/document_search.py +40 -23
- ragbits/evaluate/metrics/gaia.py +84 -0
- ragbits/evaluate/metrics/hotpot_qa.py +51 -0
- ragbits/evaluate/metrics/human_eval.py +105 -0
- ragbits/evaluate/metrics/question_answer.py +222 -0
- ragbits/evaluate/optimizer.py +138 -86
- ragbits/evaluate/pipelines/__init__.py +37 -0
- ragbits/evaluate/pipelines/base.py +34 -10
- ragbits/evaluate/pipelines/document_search.py +72 -67
- ragbits/evaluate/pipelines/gaia.py +249 -0
- ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
- ragbits/evaluate/pipelines/human_eval.py +323 -0
- ragbits/evaluate/pipelines/question_answer.py +96 -0
- ragbits/evaluate/utils.py +86 -59
- {ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +33 -9
- ragbits_evaluate-1.4.0.dev202602030301.dist-info/RECORD +59 -0
- {ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +1 -1
- ragbits/evaluate/callbacks/base.py +0 -22
- ragbits/evaluate/callbacks/neptune.py +0 -26
- ragbits/evaluate/loaders/__init__.py +0 -21
- ragbits/evaluate/loaders/base.py +0 -24
- ragbits/evaluate/loaders/hf.py +0 -25
- ragbits_evaluate-0.5.0.dist-info/RECORD +0 -33
- /ragbits/evaluate/{callbacks/__init__.py → py.typed} +0 -0
ragbits/evaluate/agent_simulation/logger.py
@@ -0,0 +1,165 @@
+"""Logging functionality for agent simulation scenarios."""
+
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+from zoneinfo import ZoneInfo
+
+from ragbits.agents.tool import ToolCallResult
+from ragbits.core.llms import Usage
+from ragbits.evaluate.agent_simulation.models import Personality, Scenario, Task
+
+
+class ConversationLogger:
+    """Handles logging of conversation sessions to a file."""
+
+    def __init__(self, log_file: str | None) -> None:
+        """Initialize logger with optional log file path."""
+        self.log_path: Path | None = None
+        if log_file:
+            self.log_path = Path(log_file)
+            self.log_path.parent.mkdir(parents=True, exist_ok=True)
+
+    def initialize_session(
+        self,
+        scenario: Scenario,
+        agent_model_name: str | None,
+        sim_user_model_name: str | None,
+        checker_model_name: str | None,
+        personality: Personality | None = None,
+    ) -> None:
+        """Initialize a new logging session with scenario metadata."""
+        if not self.log_path:
+            return
+
+        now_warsaw = datetime.now(ZoneInfo("Europe/Warsaw"))
+        with self.log_path.open("a", encoding="utf-8") as f:
+            f.write("\n" + "=" * 80 + "\n")
+            f.write(f"Session start: {now_warsaw.isoformat()}\n")
+            f.write(f"Scenario: {scenario.name}\n")
+            f.write(f"Tasks: {len(scenario.tasks)}\n")
+            for i, task in enumerate(scenario.tasks, 1):
+                f.write(f" Task {i}: {task.task}\n")
+                f.write(f" Expected: {task.expected_result}\n")
+            f.write(f"Agent model: {agent_model_name or 'default'}\n")
+            f.write(f"Simulated user model: {sim_user_model_name or 'default'}\n")
+            f.write(f"Goal checker model: {checker_model_name or 'default'}\n")
+            if personality:
+                f.write(f"Personality: {personality.name}\n")
+                f.write(f"Personality description: {personality.description}\n")
+            else:
+                f.write("Personality: none (default)\n")
+
+    def log_turn(
+        self,
+        turn_idx: int,
+        task: Task | None,
+        user_msg: str,
+        assistant_msg: str | None = None,
+        tool_calls: list[ToolCallResult] | None = None,
+        usage: Usage | None = None,
+    ) -> None:
+        """Log a conversation turn to the log file."""
+        if not self.log_path:
+            return
+
+        with self.log_path.open("a", encoding="utf-8") as f:
+            if task:
+                f.write(f"Turn {turn_idx} - Task: {task.task}\n")
+            f.write(f"Turn {turn_idx} - User: {user_msg}\n")
+            if assistant_msg:
+                f.write(f"Turn {turn_idx} - Assistant: {assistant_msg}\n")
+            if tool_calls:
+                for tool_call in tool_calls:
+                    f.write(f"Turn {turn_idx} - Tool: {tool_call.name}({tool_call.arguments})\n")
+            if usage:
+                f.write(
+                    f"Turn {turn_idx} - Assistant token usage: {usage.total_tokens} total "
+                    f"({usage.prompt_tokens} prompt + {usage.completion_tokens} completion), "
+                    f"estimated cost: ${usage.estimated_cost:.6f}\n"
+                )
+
+    def log_task_check(self, turn_idx: int, task_done: bool, reason: str) -> None:
+        """Log task completion check result."""
+        if self.log_path:
+            with self.log_path.open("a", encoding="utf-8") as f:
+                f.write(f"Turn {turn_idx} - Task check: done={task_done} reason={reason}\n")
+
+    def log_task_transition(self, next_task: Task) -> None:
+        """Log transition to next task."""
+        if self.log_path:
+            with self.log_path.open("a", encoding="utf-8") as f:
+                f.write(f"Moving to next task: {next_task.task}\n")
+
+    def log_tool_check(
+        self,
+        turn_idx: int,
+        tools_used_correctly: bool,
+        reason: str,
+        tool_calls: list[ToolCallResult],
+    ) -> None:
+        """Log tool usage check result."""
+        if self.log_path:
+            with self.log_path.open("a", encoding="utf-8") as f:
+                tool_names = [tc.name for tc in tool_calls] if tool_calls else []
+                f.write(
+                    f"Turn {turn_idx} - Tool check: appropriate={tools_used_correctly} "
+                    f"tools_called={tool_names} reason={reason}\n"
+                )
+
+    def log_total_usage(self, usage: Usage) -> None:
+        """Log total assistant token usage for the entire conversation."""
+        if not self.log_path:
+            return
+
+        with self.log_path.open("a", encoding="utf-8") as f:
+            f.write("\n--- Total Assistant Token Usage ---\n")
+            f.write(
+                f"Total assistant tokens: {usage.total_tokens} "
+                f"({usage.prompt_tokens} prompt + {usage.completion_tokens} completion)\n"
+            )
+            f.write(f"Total estimated cost: ${usage.estimated_cost:.6f}\n")
+            f.write("--- End Total Assistant Token Usage ---\n")
+
+    def log_deepeval_metrics(self, metrics: dict[str, Any] | dict[str, dict[str, float | str | None]]) -> None:
+        """Log DeepEval evaluation metrics to the log file.
+
+        Args:
+            metrics: Dictionary of metric names to their evaluation results, or error dict
+        """
+        if not self.log_path:
+            return
+
+        with self.log_path.open("a", encoding="utf-8") as f:
+            f.write("\n--- DeepEval Evaluation Metrics ---\n")
+            if "error" in metrics and isinstance(metrics["error"], str):
+                # Handle error case
+                f.write(f"DeepEval Error: {metrics['error']}\n")
+            else:
+                # Handle normal metrics case
+                for metric_name, result in metrics.items():
+                    if not isinstance(result, dict):
+                        continue
+                    score = result.get("score")
+                    reason = result.get("reason")
+                    success = result.get("success")
+                    error = result.get("error")
+
+                    f.write(f"Metric: {metric_name}\n")
+                    if error:
+                        f.write(f" Error: {error}\n")
+                    else:
+                        if score is not None:
+                            f.write(f" Score: {score:.4f}\n")
+                        if success is not None:
+                            f.write(f" Success: {success}\n")
+                        if reason:
+                            f.write(f" Reason: {reason}\n")
+            f.write("--- End DeepEval Metrics ---\n")
+
+    def finalize_session(self) -> None:
+        """Finalize the logging session."""
+        if self.log_path:
+            with self.log_path.open("a", encoding="utf-8") as f:
+                end_time = datetime.now(ZoneInfo("Europe/Warsaw")).isoformat()
+                f.write(f"Session end: {end_time}\n")
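For orientation, here is a minimal sketch of how the new ConversationLogger might be driven by a harness. The Scenario and Task dataclasses are the ones added in models.py further down this diff; the scenario content, model name, and log path are illustrative placeholders only.

from ragbits.evaluate.agent_simulation.logger import ConversationLogger
from ragbits.evaluate.agent_simulation.models import Scenario, Task

# Illustrative scenario; the task text and expected result are made up for this sketch.
scenario = Scenario(
    name="order-tracking",
    tasks=[Task(task="Ask about order 42", expected_result="The order status is reported")],
)

logger = ConversationLogger("logs/simulation.log")  # pass None to disable file logging
logger.initialize_session(
    scenario,
    agent_model_name="gpt-4o-mini",  # placeholder model names for the sketch
    sim_user_model_name=None,
    checker_model_name=None,
)
logger.log_turn(
    turn_idx=1,
    task=scenario.tasks[0],
    user_msg="Where is my order?",
    assistant_msg="Order 42 ships tomorrow.",
)
logger.log_task_check(turn_idx=1, task_done=True, reason="Assistant reported the order status.")
logger.finalize_session()

In the package itself these calls are made by the simulation loop in conversation.py and simulation.py; the sketch only shows the logging surface added here.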
ragbits/evaluate/agent_simulation/metrics/__init__.py
@@ -0,0 +1,19 @@
+"""Metrics collection components for agent simulation."""
+
+from ragbits.evaluate.agent_simulation.metrics.builtin import (
+    LatencyMetricCollector,
+    TokenUsageMetricCollector,
+    ToolUsageMetricCollector,
+)
+from ragbits.evaluate.agent_simulation.metrics.collectors import (
+    CompositeMetricCollector,
+    MetricCollector,
+)
+
+__all__ = [
+    "CompositeMetricCollector",
+    "LatencyMetricCollector",
+    "MetricCollector",
+    "TokenUsageMetricCollector",
+    "ToolUsageMetricCollector",
+]
ragbits/evaluate/agent_simulation/metrics/builtin.py
@@ -0,0 +1,221 @@
+"""Built-in metric collectors for common simulation metrics."""
+
+from __future__ import annotations
+
+import time
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from ragbits.evaluate.agent_simulation.results import TurnResult
+
+
+class LatencyMetricCollector:
+    """Tracks response latency per turn.
+
+    Measures the wall-clock time from turn start to turn end,
+    providing average, min, max, and per-turn latency metrics.
+
+    Example:
+        >>> collector = LatencyMetricCollector()
+        >>> result = await run_simulation(
+        ...     scenario=scenario,
+        ...     chat=chat,
+        ...     metric_collectors=[collector],
+        ... )
+        >>> print(result.metrics.custom["latency_avg_ms"])
+    """
+
+    def __init__(self) -> None:
+        """Initialize the latency collector."""
+        self._turn_start: float | None = None
+        self._latencies: list[float] = []
+
+    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
+        """Record the start time for this turn.
+
+        Args:
+            turn_index: 1-based index of the current turn.
+            task_index: 0-based index of the current task.
+            user_message: The user message (unused).
+        """
+        self._turn_start = time.perf_counter()
+
+    def on_turn_end(self, turn_result: TurnResult) -> None:
+        """Calculate and store the latency for this turn.
+
+        Args:
+            turn_result: The result of the completed turn (unused directly).
+        """
+        if self._turn_start is not None:
+            latency_ms = (time.perf_counter() - self._turn_start) * 1000
+            self._latencies.append(latency_ms)
+            self._turn_start = None
+
+    def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
+        """Return latency metrics.
+
+        Args:
+            all_turns: List of all turn results (unused).
+
+        Returns:
+            Dictionary with latency_avg_ms, latency_max_ms, latency_min_ms,
+            and latency_per_turn_ms.
+        """
+        if not self._latencies:
+            return {}
+
+        return {
+            "latency_avg_ms": sum(self._latencies) / len(self._latencies),
+            "latency_max_ms": max(self._latencies),
+            "latency_min_ms": min(self._latencies),
+            "latency_per_turn_ms": self._latencies.copy(),
+        }
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._turn_start = None
+        self._latencies = []
+
+
+class TokenUsageMetricCollector:
+    """Tracks token usage and estimated cost per turn.
+
+    Aggregates token counts from each turn to provide total and
+    per-turn token usage statistics.
+
+    Example:
+        >>> collector = TokenUsageMetricCollector()
+        >>> result = await run_simulation(
+        ...     scenario=scenario,
+        ...     chat=chat,
+        ...     metric_collectors=[collector],
+        ... )
+        >>> print(result.metrics.custom["tokens_total"])
+    """
+
+    def __init__(self) -> None:
+        """Initialize the token usage collector."""
+        self._turn_tokens: list[dict[str, int]] = []
+
+    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
+        """No-op for token collector.
+
+        Args:
+            turn_index: 1-based index of the current turn.
+            task_index: 0-based index of the current task.
+            user_message: The user message (unused).
+        """
+        pass
+
+    def on_turn_end(self, turn_result: TurnResult) -> None:
+        """Record token usage from the turn result.
+
+        Args:
+            turn_result: The result of the completed turn.
+        """
+        if turn_result.token_usage:
+            self._turn_tokens.append(turn_result.token_usage.copy())
+        else:
+            self._turn_tokens.append({"total": 0, "prompt": 0, "completion": 0})
+
+    def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
+        """Return aggregated token usage metrics.
+
+        Args:
+            all_turns: List of all turn results (unused).
+
+        Returns:
+            Dictionary with tokens_total, tokens_prompt, tokens_completion,
+            tokens_avg_per_turn, and tokens_per_turn.
+        """
+        if not self._turn_tokens:
+            return {}
+
+        total = sum(t.get("total", 0) for t in self._turn_tokens)
+        prompt = sum(t.get("prompt", 0) for t in self._turn_tokens)
+        completion = sum(t.get("completion", 0) for t in self._turn_tokens)
+
+        return {
+            "tokens_total": total,
+            "tokens_prompt": prompt,
+            "tokens_completion": completion,
+            "tokens_avg_per_turn": total / len(self._turn_tokens) if self._turn_tokens else 0,
+            "tokens_per_turn": [t.get("total", 0) for t in self._turn_tokens],
+        }
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._turn_tokens = []
+
+
+class ToolUsageMetricCollector:
+    """Tracks tool call patterns during the conversation.
+
+    Records which tools were called, how often, and on which turns,
+    providing insights into agent behavior.
+
+    Example:
+        >>> collector = ToolUsageMetricCollector()
+        >>> result = await run_simulation(
+        ...     scenario=scenario,
+        ...     chat=chat,
+        ...     metric_collectors=[collector],
+        ... )
+        >>> print(result.metrics.custom["tools_unique"])
+    """
+
+    def __init__(self) -> None:
+        """Initialize the tool usage collector."""
+        self._tool_calls: list[list[str]] = []  # tool names per turn
+        self._tool_counts: dict[str, int] = {}  # total count per tool
+
+    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
+        """No-op for tool usage collector.
+
+        Args:
+            turn_index: 1-based index of the current turn.
+            task_index: 0-based index of the current task.
+            user_message: The user message (unused).
+        """
+        pass
+
+    def on_turn_end(self, turn_result: TurnResult) -> None:
+        """Record tool calls from the turn result.
+
+        Args:
+            turn_result: The result of the completed turn.
+        """
+        tool_names: list[str] = []
+        if turn_result.tool_calls:
+            for tc in turn_result.tool_calls:
+                name = tc.get("name", "unknown")
+                tool_names.append(name)
+                self._tool_counts[name] = self._tool_counts.get(name, 0) + 1
+
+        self._tool_calls.append(tool_names)
+
+    def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
+        """Return tool usage metrics.
+
+        Args:
+            all_turns: List of all turn results (unused).
+
+        Returns:
+            Dictionary with tools_total_calls, tools_unique, tools_counts,
+            tools_per_turn, and turns_with_tools.
+        """
+        total_calls = sum(len(tools) for tools in self._tool_calls)
+        turns_with_tools = sum(1 for tools in self._tool_calls if tools)
+
+        return {
+            "tools_total_calls": total_calls,
+            "tools_unique": list(self._tool_counts.keys()),
+            "tools_counts": self._tool_counts.copy(),
+            "tools_per_turn": self._tool_calls.copy(),
+            "turns_with_tools": turns_with_tools,
+        }
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._tool_calls = []
+        self._tool_counts = {}
ragbits/evaluate/agent_simulation/metrics/collectors.py
@@ -0,0 +1,142 @@
+"""Base protocol and composite collector for metrics collection."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+    from ragbits.evaluate.agent_simulation.results import TurnResult
+
+
+@runtime_checkable
+class MetricCollector(Protocol):
+    """Protocol for collecting metrics during conversation simulation.
+
+    Implement this protocol to create custom metric collectors that can
+    be passed to run_simulation(). Collectors receive callbacks at various
+    points during the simulation lifecycle.
+
+    Example:
+        >>> class CustomCollector:
+        ...     def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
+        ...         print(f"Turn {turn_index} starting")
+        ...
+        ...     def on_turn_end(self, turn_result: TurnResult) -> None:
+        ...         print(f"Turn completed: {turn_result.task_completed}")
+        ...
+        ...     def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
+        ...         return {"total_turns_tracked": len(all_turns)}
+        ...
+        ...     def reset(self) -> None:
+        ...         pass
+    """
+
+    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
+        """Called before agent processes a turn.
+
+        Args:
+            turn_index: 1-based index of the current turn.
+            task_index: 0-based index of the current task.
+            user_message: The user message being sent to the agent.
+        """
+        ...
+
+    def on_turn_end(self, turn_result: TurnResult) -> None:
+        """Called after a turn completes.
+
+        Args:
+            turn_result: The result of the completed turn.
+        """
+        ...
+
+    def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
+        """Called when the conversation ends, returns computed metrics.
+
+        Args:
+            all_turns: List of all turn results from the conversation.
+
+        Returns:
+            Dictionary of metric names to values.
+        """
+        ...
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        ...
+
+
+class CompositeMetricCollector:
+    """Combines multiple metric collectors into a single interface.
+
+    This collector delegates all method calls to its child collectors,
+    aggregating their results at the end of the conversation.
+
+    Example:
+        >>> from ragbits.evaluate.agent_simulation.metrics import (
+        ...     LatencyMetricCollector,
+        ...     TokenUsageMetricCollector,
+        ...     CompositeMetricCollector,
+        ... )
+        >>> composite = CompositeMetricCollector(
+        ...     [
+        ...         LatencyMetricCollector(),
+        ...         TokenUsageMetricCollector(),
+        ...     ]
+        ... )
+    """
+
+    def __init__(self, collectors: list[MetricCollector] | None = None) -> None:
+        """Initialize with a list of metric collectors.
+
+        Args:
+            collectors: List of collectors to combine. Defaults to empty list.
+        """
+        self._collectors: list[MetricCollector] = collectors or []
+
+    def add(self, collector: MetricCollector) -> None:
+        """Add a collector to the composite.
+
+        Args:
+            collector: Collector to add.
+        """
+        self._collectors.append(collector)
+
+    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
+        """Delegate to all child collectors.
+
+        Args:
+            turn_index: 1-based index of the current turn.
+            task_index: 0-based index of the current task.
+            user_message: The user message being sent to the agent.
+        """
+        for collector in self._collectors:
+            collector.on_turn_start(turn_index, task_index, user_message)
+
+    def on_turn_end(self, turn_result: TurnResult) -> None:
+        """Delegate to all child collectors.
+
+        Args:
+            turn_result: The result of the completed turn.
+        """
+        for collector in self._collectors:
+            collector.on_turn_end(turn_result)
+
+    def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
+        """Aggregate metrics from all child collectors.
+
+        Args:
+            all_turns: List of all turn results from the conversation.
+
+        Returns:
+            Dictionary combining all collector metrics.
+        """
+        combined: dict[str, Any] = {}
+        for collector in self._collectors:
+            metrics = collector.on_conversation_end(all_turns)
+            combined.update(metrics)
+        return combined
+
+    def reset(self) -> None:
+        """Reset all child collectors."""
+        for collector in self._collectors:
+            collector.reset()
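A self-contained sketch of the collectors working together outside a full simulation run follows. FakeTurn below is a hypothetical stand-in for the real TurnResult from results.py (not shown in this excerpt); it only provides the two attributes these built-in collectors actually read, token_usage and tool_calls.

from dataclasses import dataclass, field
from typing import Any

from ragbits.evaluate.agent_simulation.metrics import (
    CompositeMetricCollector,
    LatencyMetricCollector,
    TokenUsageMetricCollector,
    ToolUsageMetricCollector,
)


@dataclass
class FakeTurn:
    """Hypothetical stand-in for TurnResult, carrying only what these collectors read."""

    token_usage: dict[str, int] | None = None
    tool_calls: list[dict[str, Any]] = field(default_factory=list)


composite = CompositeMetricCollector(
    [LatencyMetricCollector(), TokenUsageMetricCollector(), ToolUsageMetricCollector()]
)
composite.on_turn_start(turn_index=1, task_index=0, user_message="Where is my order?")
composite.on_turn_end(
    FakeTurn(
        token_usage={"total": 120, "prompt": 90, "completion": 30},
        tool_calls=[{"name": "get_order_status", "arguments": {"order_id": 42}}],
    )
)
# Combines the latency_*, tokens_* and tools_* keys produced by the three child collectors.
print(composite.on_conversation_end([]))

In an actual evaluation these callbacks are invoked by the simulation loop; passing the collectors via metric_collectors=[...] to run_simulation, as the class docstrings above indicate, is the intended path.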
ragbits/evaluate/agent_simulation/models.py
@@ -0,0 +1,37 @@
+"""Data models for agent simulation scenarios."""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class Turn:
+    """A single conversation turn between user and assistant."""
+
+    user: str
+    assistant: str
+
+
+@dataclass
+class Task:
+    """A single task with its expected result."""
+
+    task: str
+    expected_result: str
+    expected_tools: list[str] | None = None
+    """Optional list of tool names that should be used to complete this task."""
+
+
+@dataclass
+class Scenario:
+    """A scenario containing multiple tasks to be completed sequentially."""
+
+    name: str
+    tasks: list[Task]
+
+
+@dataclass
+class Personality:
+    """A personality definition for the simulated user."""
+
+    name: str
+    description: str