ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/evaluate/agent_simulation/__init__.py +4 -49
- ragbits/evaluate/agent_simulation/conversation.py +278 -663
- ragbits/evaluate/agent_simulation/logger.py +1 -1
- ragbits/evaluate/agent_simulation/metrics/__init__.py +0 -10
- ragbits/evaluate/agent_simulation/metrics/builtin.py +49 -59
- ragbits/evaluate/agent_simulation/metrics/collectors.py +17 -37
- ragbits/evaluate/agent_simulation/models.py +18 -198
- ragbits/evaluate/agent_simulation/results.py +49 -125
- ragbits/evaluate/agent_simulation/scenarios.py +19 -95
- ragbits/evaluate/agent_simulation/simulation.py +166 -72
- ragbits/evaluate/metrics/question_answer.py +25 -8
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +2 -6
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/RECORD +14 -25
- ragbits/evaluate/agent_simulation/checkers.py +0 -591
- ragbits/evaluate/agent_simulation/display.py +0 -118
- ragbits/evaluate/agent_simulation/metrics/deepeval.py +0 -295
- ragbits/evaluate/agent_simulation/tracing.py +0 -233
- ragbits/evaluate/api.py +0 -603
- ragbits/evaluate/api_types.py +0 -343
- ragbits/evaluate/execution_manager.py +0 -451
- ragbits/evaluate/stores/__init__.py +0 -36
- ragbits/evaluate/stores/base.py +0 -98
- ragbits/evaluate/stores/file.py +0 -466
- ragbits/evaluate/stores/kv.py +0 -535
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +0 -0
ragbits/evaluate/agent_simulation/display.py
@@ -1,118 +0,0 @@
-"""Rich display components for agent simulation."""
-
-from rich.console import Console
-from rich.live import Live
-from rich.panel import Panel
-from rich.text import Text
-
-from ragbits.evaluate.agent_simulation.models import Scenario
-
-
-def display_scenario(scenario: Scenario, console: Console | None = None) -> None:
-    """Display scenario with rich panel.
-
-    Args:
-        scenario: The scenario to display.
-        console: Optional Rich console instance. If not provided, a new one is created.
-    """
-    if console is None:
-        console = Console()
-
-    console.print(_build_panel(scenario))
-
-
-def _build_panel(
-    scenario: Scenario,
-    current_task_idx: int | None = None,
-    task_status: dict[int, str] | None = None,
-    metrics: dict[str, str | int | float] | None = None,
-) -> Panel:
-    """Build a rich panel for the scenario.
-
-    Args:
-        scenario: The scenario to display.
-        current_task_idx: Index of currently running task (for live display).
-        task_status: Dict mapping task index to status emoji/text.
-        metrics: Optional metrics to display at the bottom.
-
-    Returns:
-        Rich Panel object.
-    """
-    lines = Text()
-
-    for i, task in enumerate(scenario.tasks):
-        # Status indicator
-        if task_status and i in task_status:
-            status = task_status[i]
-        elif current_task_idx is not None and i == current_task_idx:
-            status = "▶"
-        elif current_task_idx is not None and i < current_task_idx:
-            status = "✓"
-        else:
-            status = " "
-
-        style = "bold" if current_task_idx == i else ""
-        lines.append(f"{status} {i + 1}. {task.task}\n", style=style)
-        # Show checker configuration summary
-        if task.checkers:
-            lines.append(f" → {task.get_checker_summary()}\n", style="green")
-
-    if metrics:
-        lines.append("\n")
-        for key, value in metrics.items():
-            lines.append(f"{key}: {value} ", style="cyan")
-
-    title = scenario.name
-    if scenario.group:
-        title += f" [dim]({scenario.group})[/dim]"
-
-    return Panel(lines, title=title, border_style="blue")
-
-
-class ScenarioLiveDisplay:
-    """Live display for scenario execution with real-time updates."""
-
-    def __init__(self, scenario: Scenario, console: Console | None = None) -> None:
-        self.scenario = scenario
-        self.console = console or Console()
-        self.current_task_idx: int | None = None
-        self.task_status: dict[int, str] = {}
-        self.metrics: dict[str, str | int | float] = {}
-        self._live: Live | None = None
-
-    def __enter__(self) -> "ScenarioLiveDisplay":
-        self._live = Live(self._render(), console=self.console, refresh_per_second=4)
-        self._live.__enter__()
-        return self
-
-    def __exit__(self, *args: object) -> None:
-        if self._live:
-            self._live.__exit__(*args)
-
-    def _render(self) -> Panel:
-        return _build_panel(
-            self.scenario,
-            current_task_idx=self.current_task_idx,
-            task_status=self.task_status,
-            metrics=self.metrics,
-        )
-
-    def update(self) -> None:
-        """Refresh the display."""
-        if self._live:
-            self._live.update(self._render())
-
-    def set_current_task(self, idx: int) -> None:
-        """Set the currently running task index."""
-        self.current_task_idx = idx
-        self.update()
-
-    def mark_task_done(self, idx: int, success: bool = True) -> None:
-        """Mark a task as completed."""
-        self.task_status[idx] = "✓" if success else "✗"
-        self.update()
-
-    def set_metric(self, key: str, value: str | int | float) -> None:
-        """Update a metric value."""
-        self.metrics[key] = value
-        self.update()
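For orientation, a minimal usage sketch of the removed display helpers, based only on the signatures and docstrings in the hunk above; how `scenario` is constructed and what runs inside the task loop are hypothetical.

# Hypothetical sketch of the pre-1.4.0 display.py API removed above.
from ragbits.evaluate.agent_simulation.display import ScenarioLiveDisplay, display_scenario
from ragbits.evaluate.agent_simulation.models import Scenario


def show_progress(scenario: Scenario) -> None:
    # One-shot rendering of the scenario as a Rich panel.
    display_scenario(scenario)

    # Live rendering with per-task status and metric updates.
    with ScenarioLiveDisplay(scenario) as live:
        for idx, _task in enumerate(scenario.tasks):
            live.set_current_task(idx)  # marks the task as running ("▶")
            # ... execute the task here (hypothetical) ...
            live.mark_task_done(idx, success=True)  # "✓" on success, "✗" on failure
        live.set_metric("tasks_completed", len(scenario.tasks))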
ragbits/evaluate/agent_simulation/metrics/deepeval.py
@@ -1,295 +0,0 @@
-"""DeepEval metric collectors following the MetricCollector protocol."""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING, Any
-
-from ragbits.evaluate.agent_simulation.metrics.collectors import MetricCollector
-
-if TYPE_CHECKING:
-    from ragbits.evaluate.agent_simulation.results import TurnResult
-
-
-class DeepEvalCompletenessMetricCollector(MetricCollector):
-    """Tracks conversation completeness using DeepEval's ConversationCompletenessMetric.
-
-    Evaluates how well the assistant addresses the user's requests throughout
-    the conversation.
-
-    Example:
-        >>> result = await run_simulation(
-        ...     scenario=scenario,
-        ...     chat=chat,
-        ...     config=SimulationConfig(metrics=[DeepEvalCompletenessMetricCollector]),
-        ... )
-        >>> print(result.metrics.custom.get("deepeval_completeness"))
-    """
-
-    def __init__(self) -> None:
-        """Initialize the completeness metric collector."""
-        self._turns: list[tuple[str, str]] = []  # (user, assistant) pairs
-
-    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
-        """No-op for DeepEval completeness collector.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message (stored in on_turn_end).
-        """
-        pass
-
-    def on_turn_end(self, turn_result: TurnResult) -> None:
-        """Record the turn for later evaluation.
-
-        Args:
-            turn_result: The result of the completed turn.
-        """
-        self._turns.append((turn_result.user_message, turn_result.assistant_message))
-
-    def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
-        """Evaluate conversation completeness using DeepEval.
-
-        Args:
-            all_turns: List of all turn results.
-
-        Returns:
-            Dictionary with deepeval_completeness score and reason.
-        """
-        if not self._turns:
-            return {}
-
-        try:
-            from deepeval.metrics import ConversationCompletenessMetric  # type: ignore[attr-defined]
-            from deepeval.test_case import ConversationalTestCase, LLMTestCase  # type: ignore[attr-defined]
-
-            deepeval_turns = [LLMTestCase(input=user, actual_output=assistant) for user, assistant in self._turns]
-            test_case = ConversationalTestCase(turns=deepeval_turns)
-            metric = ConversationCompletenessMetric()
-            metric.measure(test_case)
-
-            return {
-                "deepeval_completeness": metric.score,
-                "deepeval_completeness_reason": getattr(metric, "reason", None),
-            }
-        except Exception as e:
-            return {
-                "deepeval_completeness": None,
-                "deepeval_completeness_error": str(e),
-            }
-
-    def reset(self) -> None:
-        """Reset collector state for a new conversation."""
-        self._turns = []
-
-
-class DeepEvalRelevancyMetricCollector(MetricCollector):
-    """Tracks conversation relevancy using DeepEval's ConversationRelevancyMetric.
-
-    Evaluates how relevant the assistant's responses are to the user's queries.
-
-    Example:
-        >>> result = await run_simulation(
-        ...     scenario=scenario,
-        ...     chat=chat,
-        ...     config=SimulationConfig(metrics=[DeepEvalRelevancyMetricCollector]),
-        ... )
-        >>> print(result.metrics.custom.get("deepeval_relevancy"))
-    """
-
-    def __init__(self) -> None:
-        """Initialize the relevancy metric collector."""
-        self._turns: list[tuple[str, str]] = []
-
-    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
-        """No-op for DeepEval relevancy collector.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message (stored in on_turn_end).
-        """
-        pass
-
-    def on_turn_end(self, turn_result: TurnResult) -> None:
-        """Record the turn for later evaluation.
-
-        Args:
-            turn_result: The result of the completed turn.
-        """
-        self._turns.append((turn_result.user_message, turn_result.assistant_message))
-
-    def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
-        """Evaluate conversation relevancy using DeepEval.
-
-        Args:
-            all_turns: List of all turn results.
-
-        Returns:
-            Dictionary with deepeval_relevancy score and reason.
-        """
-        if not self._turns:
-            return {}
-
-        try:
-            from deepeval.metrics import ConversationRelevancyMetric  # type: ignore[attr-defined]
-            from deepeval.test_case import ConversationalTestCase, LLMTestCase  # type: ignore[attr-defined]
-
-            deepeval_turns = [LLMTestCase(input=user, actual_output=assistant) for user, assistant in self._turns]
-            test_case = ConversationalTestCase(turns=deepeval_turns)
-            metric = ConversationRelevancyMetric()
-            metric.measure(test_case)
-
-            return {
-                "deepeval_relevancy": metric.score,
-                "deepeval_relevancy_reason": getattr(metric, "reason", None),
-            }
-        except Exception as e:
-            return {
-                "deepeval_relevancy": None,
-                "deepeval_relevancy_error": str(e),
-            }
-
-    def reset(self) -> None:
-        """Reset collector state for a new conversation."""
-        self._turns = []
-
-
-class DeepEvalKnowledgeRetentionMetricCollector(MetricCollector):
-    """Tracks knowledge retention using DeepEval's KnowledgeRetentionMetric.
-
-    Evaluates how well the assistant retains and uses information from earlier
-    in the conversation.
-
-    Example:
-        >>> result = await run_simulation(
-        ...     scenario=scenario,
-        ...     chat=chat,
-        ...     config=SimulationConfig(metrics=[DeepEvalKnowledgeRetentionMetricCollector]),
-        ... )
-        >>> print(result.metrics.custom.get("deepeval_knowledge_retention"))
-    """
-
-    def __init__(self) -> None:
-        """Initialize the knowledge retention metric collector."""
-        self._turns: list[tuple[str, str]] = []
-
-    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
-        """No-op for DeepEval knowledge retention collector.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message (stored in on_turn_end).
-        """
-        pass
-
-    def on_turn_end(self, turn_result: TurnResult) -> None:
-        """Record the turn for later evaluation.
-
-        Args:
-            turn_result: The result of the completed turn.
-        """
-        self._turns.append((turn_result.user_message, turn_result.assistant_message))
-
-    def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
-        """Evaluate knowledge retention using DeepEval.
-
-        Args:
-            all_turns: List of all turn results.
-
-        Returns:
-            Dictionary with deepeval_knowledge_retention score and reason.
-        """
-        if not self._turns:
-            return {}
-
-        try:
-            from deepeval.metrics import KnowledgeRetentionMetric  # type: ignore[attr-defined]
-            from deepeval.test_case import ConversationalTestCase, LLMTestCase  # type: ignore[attr-defined]
-
-            deepeval_turns = [LLMTestCase(input=user, actual_output=assistant) for user, assistant in self._turns]
-            test_case = ConversationalTestCase(turns=deepeval_turns)
-            metric = KnowledgeRetentionMetric()
-            metric.measure(test_case)
-
-            return {
-                "deepeval_knowledge_retention": metric.score,
-                "deepeval_knowledge_retention_reason": getattr(metric, "reason", None),
-            }
-        except Exception as e:
-            return {
-                "deepeval_knowledge_retention": None,
-                "deepeval_knowledge_retention_error": str(e),
-            }
-
-    def reset(self) -> None:
-        """Reset collector state for a new conversation."""
-        self._turns = []
-
-
-class DeepEvalAllMetricsCollector(MetricCollector):
-    """Composite collector that evaluates all DeepEval conversation metrics.
-
-    Runs all three DeepEval metrics (completeness, relevancy, knowledge retention)
-    at the end of the conversation.
-
-    Example:
-        >>> result = await run_simulation(
-        ...     scenario=scenario,
-        ...     chat=chat,
-        ...     config=SimulationConfig(metrics=[DeepEvalAllMetricsCollector]),
-        ... )
-        >>> print(result.metrics.custom.get("deepeval_completeness"))
-        >>> print(result.metrics.custom.get("deepeval_relevancy"))
-        >>> print(result.metrics.custom.get("deepeval_knowledge_retention"))
-    """
-
-    def __init__(self) -> None:
-        """Initialize the all-metrics collector."""
-        self._completeness = DeepEvalCompletenessMetricCollector()
-        self._relevancy = DeepEvalRelevancyMetricCollector()
-        self._knowledge_retention = DeepEvalKnowledgeRetentionMetricCollector()
-
-    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
-        """Delegate to all child collectors.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message.
-        """
-        self._completeness.on_turn_start(turn_index, task_index, user_message)
-        self._relevancy.on_turn_start(turn_index, task_index, user_message)
-        self._knowledge_retention.on_turn_start(turn_index, task_index, user_message)
-
-    def on_turn_end(self, turn_result: TurnResult) -> None:
-        """Delegate to all child collectors.
-
-        Args:
-            turn_result: The result of the completed turn.
-        """
-        self._completeness.on_turn_end(turn_result)
-        self._relevancy.on_turn_end(turn_result)
-        self._knowledge_retention.on_turn_end(turn_result)
-
-    def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
-        """Aggregate metrics from all child collectors.
-
-        Args:
-            all_turns: List of all turn results.
-
-        Returns:
-            Dictionary combining all DeepEval metrics.
-        """
-        combined: dict[str, Any] = {}
-        combined.update(self._completeness.on_conversation_end(all_turns))
-        combined.update(self._relevancy.on_conversation_end(all_turns))
-        combined.update(self._knowledge_retention.on_conversation_end(all_turns))
-        return combined
-
-    def reset(self) -> None:
-        """Reset all child collectors."""
-        self._completeness.reset()
-        self._relevancy.reset()
-        self._knowledge_retention.reset()
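For orientation, a minimal sketch of wiring the removed DeepEval collectors into a simulation run, following the docstring examples in the hunk above; the import path for run_simulation/SimulationConfig and the setup of `scenario` and `chat` are assumptions, and the optional deepeval dependency must be installed.

# Hypothetical sketch of the pre-1.4.0 DeepEval collector API removed above.
from ragbits.evaluate.agent_simulation.metrics.deepeval import DeepEvalAllMetricsCollector
from ragbits.evaluate.agent_simulation.simulation import SimulationConfig, run_simulation  # assumed import path


async def evaluate_conversation(scenario, chat) -> dict:
    result = await run_simulation(
        scenario=scenario,
        chat=chat,
        config=SimulationConfig(metrics=[DeepEvalAllMetricsCollector]),
    )
    # Scores land in result.metrics.custom; failures surface as *_error keys instead.
    return {
        "completeness": result.metrics.custom.get("deepeval_completeness"),
        "relevancy": result.metrics.custom.get("deepeval_relevancy"),
        "knowledge_retention": result.metrics.custom.get("deepeval_knowledge_retention"),
    }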
ragbits/evaluate/agent_simulation/tracing.py
@@ -1,233 +0,0 @@
-"""Tracing utilities for agent simulation.
-
-Provides context managers and analyzers for capturing and analyzing
-LLM calls, tool invocations, and token usage during simulation runs.
-"""
-
-from collections.abc import Iterator
-from contextlib import contextmanager
-from contextvars import Token
-from dataclasses import dataclass
-from typing import Any
-
-from ragbits.agents.tool import ToolCallResult
-from ragbits.core.audit.traces import MemoryTraceHandler, set_trace_handlers
-from ragbits.core.audit.traces.memory import TraceSpan, _TraceSession, _current_session
-from ragbits.core.llms import Usage
-from ragbits.core.llms.base import UsageItem
-
-__all__ = [
-    "LLMCall",
-    "MemoryTraceHandler",
-    "TraceAnalyzer",
-    "TraceSpan",
-    "collect_traces",
-]
-
-
-@dataclass
-class LLMCall:
-    """Represents a single LLM call extracted from traces."""
-
-    model: str
-    prompt_tokens: int
-    completion_tokens: int
-    total_tokens: int
-    duration_ms: float | None = None
-
-
-@contextmanager
-def collect_traces(simulation_id: str | None = None) -> Iterator[MemoryTraceHandler]:
-    """Context manager for collecting traces during a simulation.
-
-    Sets up a context-local trace session and registers a MemoryTraceHandler
-    to capture all traced operations within the context.
-
-    Args:
-        simulation_id: Optional identifier for the simulation run.
-
-    Yields:
-        MemoryTraceHandler instance that captures traces for this context.
-
-    Example:
-        with collect_traces(simulation_id="sim-123") as handler:
-            # Run simulation code here
-            traces = handler.get_traces()
-    """
-    # Create a new session for this context
-    session = _TraceSession()
-    token: Token[_TraceSession | None] = _current_session.set(session)
-
-    # Create and register the handler
-    handler = MemoryTraceHandler()
-    set_trace_handlers(handler)
-
-    try:
-        yield handler
-    finally:
-        # Restore previous session state
-        _current_session.reset(token)
-
-
-class TraceAnalyzer:
-    """Analyzes trace spans to extract tool calls and usage information.
-
-    This class processes trace data collected by MemoryTraceHandler to
-    provide structured access to tool invocations and token usage metrics.
-    """
-
-    def __init__(self, traces: list[dict[str, Any]]) -> None:
-        """Initialize the analyzer with trace data.
-
-        Args:
-            traces: List of trace span dictionaries from MemoryTraceHandler.get_traces().
-        """
-        self._traces = traces
-        self._spans = [TraceSpan.from_dict(t) for t in traces]
-
-    @classmethod
-    def from_traces(cls, traces: list[dict[str, Any]]) -> "TraceAnalyzer":
-        """Create an analyzer from trace dictionaries.
-
-        Args:
-            traces: List of trace span dictionaries.
-
-        Returns:
-            A new TraceAnalyzer instance.
-        """
-        return cls(traces)
-
-    def get_tool_calls(self) -> list[ToolCallResult]:
-        """Extract all tool call results from the traces.
-
-        Searches through all spans (including nested children) for tool
-        invocation traces and extracts the tool call information.
-
-        Returns:
-            List of ToolCallResult objects representing all tool calls.
-        """
-        tool_calls: list[ToolCallResult] = []
-        self._extract_tool_calls_recursive(self._spans, tool_calls)
-        return tool_calls
-
-    def _extract_tool_calls_recursive(self, spans: list[TraceSpan], results: list[ToolCallResult]) -> None:
-        """Recursively extract tool calls from spans and their children.
-
-        Args:
-            spans: List of spans to process.
-            results: List to append found tool calls to.
-        """
-        for span in spans:
-            # Check if this span represents a tool call
-            if self._is_tool_call_span(span):
-                tool_result = self._extract_tool_call(span)
-                if tool_result:
-                    results.append(tool_result)
-
-            # Recurse into children
-            if span.children:
-                self._extract_tool_calls_recursive(span.children, results)
-
-    def _is_tool_call_span(self, span: TraceSpan) -> bool:
-        """Check if a span represents a tool call.
-
-        Args:
-            span: The span to check.
-
-        Returns:
-            True if the span is a tool call, False otherwise.
-        """
-        # Tool calls typically have names like "Tool.call" or contain tool-related info
-        name = span.name.lower()
-        return "tool" in name and ("call" in name or "execute" in name or "invoke" in name)
-
-    def _extract_tool_call(self, span: TraceSpan) -> ToolCallResult | None:
-        """Extract a ToolCallResult from a tool call span.
-
-        Args:
-            span: The tool call span.
-
-        Returns:
-            ToolCallResult if extraction succeeds, None otherwise.
-        """
-        inputs = span.inputs
-        outputs = span.outputs
-
-        # Try to extract tool call info from span data
-        tool_name = inputs.get("name", inputs.get("tool_name", span.name))
-        tool_id = inputs.get("id", inputs.get("tool_id", ""))
-        arguments = inputs.get("arguments", inputs.get("args", {}))
-        result = outputs.get("result", outputs.get("returned", None))
-
-        if isinstance(tool_name, str):
-            return ToolCallResult(
-                id=str(tool_id) if tool_id else "",
-                name=tool_name,
-                arguments=arguments if isinstance(arguments, dict) else {},
-                result=result,
-            )
-        return None
-
-    def get_usage(self) -> Usage:
-        """Extract aggregated token usage from the traces.
-
-        Searches through all spans for LLM call traces and aggregates
-        the token usage information.
-
-        Returns:
-            Usage object with aggregated token usage across all LLM calls.
-        """
-        usage_items: list[UsageItem] = []
-        self._extract_usage_recursive(self._spans, usage_items)
-        return Usage(requests=usage_items)
-
-    def _extract_usage_recursive(self, spans: list[TraceSpan], results: list[UsageItem]) -> None:
-        """Recursively extract usage info from spans and their children.
-
-        Args:
-            spans: List of spans to process.
-            results: List to append found usage items to.
-        """
-        for span in spans:
-            # Check if this span has usage information
-            usage_item = self._extract_usage_from_span(span)
-            if usage_item:
-                results.append(usage_item)
-
-            # Recurse into children
-            if span.children:
-                self._extract_usage_recursive(span.children, results)
-
-    def _extract_usage_from_span(self, span: TraceSpan) -> UsageItem | None:
-        """Extract a UsageItem from a span if it contains usage data.
-
-        Args:
-            span: The span to extract usage from.
-
-        Returns:
-            UsageItem if extraction succeeds, None otherwise.
-        """
-        outputs = span.outputs
-
-        # Check for usage in outputs
-        usage_data = outputs.get("usage", None)
-        if isinstance(usage_data, dict):
-            return UsageItem(
-                model=usage_data.get("model", "unknown"),
-                prompt_tokens=usage_data.get("prompt_tokens", 0),
-                completion_tokens=usage_data.get("completion_tokens", 0),
-                total_tokens=usage_data.get("total_tokens", 0),
-                estimated_cost=usage_data.get("estimated_cost", 0.0),
-            )
-
-        # Check for direct token counts in outputs
-        if "prompt_tokens" in outputs or "completion_tokens" in outputs:
-            return UsageItem(
-                model=outputs.get("model", span.inputs.get("model", "unknown")),
-                prompt_tokens=outputs.get("prompt_tokens", 0),
-                completion_tokens=outputs.get("completion_tokens", 0),
-                total_tokens=outputs.get("total_tokens", 0),
-                estimated_cost=outputs.get("estimated_cost", 0.0),
-            )
-
-        return None
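For orientation, a minimal sketch of the removed tracing utilities, based on the collect_traces docstring and TraceAnalyzer methods in the hunk above; the `simulate` callable stands in for whatever simulation code was being traced.

# Hypothetical sketch of the pre-1.4.0 tracing API removed above.
from collections.abc import Awaitable, Callable

from ragbits.evaluate.agent_simulation.tracing import TraceAnalyzer, collect_traces


async def run_with_tracing(simulate: Callable[[], Awaitable[None]]):
    # Capture every traced LLM call and tool invocation emitted while `simulate` runs.
    with collect_traces(simulation_id="sim-123") as handler:
        await simulate()
        traces = handler.get_traces()

    analyzer = TraceAnalyzer.from_traces(traces)
    tool_calls = analyzer.get_tool_calls()  # ToolCallResult objects found in tool spans
    usage = analyzer.get_usage()  # aggregated token usage across all LLM call spans
    return tool_calls, usage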