ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -40,7 +40,7 @@ class ConversationLogger:
             f.write(f"Tasks: {len(scenario.tasks)}\n")
             for i, task in enumerate(scenario.tasks, 1):
                 f.write(f" Task {i}: {task.task}\n")
-                f.write(f" Checkers: {task.checkers.model_dump_json()}\n")
+                f.write(f" Expected: {task.expected_result}\n")
             f.write(f"Agent model: {agent_model_name or 'default'}\n")
             f.write(f"Simulated user model: {sim_user_model_name or 'default'}\n")
             f.write(f"Goal checker model: {checker_model_name or 'default'}\n")
@@ -9,19 +9,9 @@ from ragbits.evaluate.agent_simulation.metrics.collectors import (
     CompositeMetricCollector,
     MetricCollector,
 )
-from ragbits.evaluate.agent_simulation.metrics.deepeval import (
-    DeepEvalAllMetricsCollector,
-    DeepEvalCompletenessMetricCollector,
-    DeepEvalKnowledgeRetentionMetricCollector,
-    DeepEvalRelevancyMetricCollector,
-)
 
 __all__ = [
     "CompositeMetricCollector",
-    "DeepEvalAllMetricsCollector",
-    "DeepEvalCompletenessMetricCollector",
-    "DeepEvalKnowledgeRetentionMetricCollector",
-    "DeepEvalRelevancyMetricCollector",
     "LatencyMetricCollector",
     "MetricCollector",
     "TokenUsageMetricCollector",
@@ -3,28 +3,24 @@
 from __future__ import annotations
 
 import time
-from functools import reduce
 from typing import TYPE_CHECKING, Any
 
-from ragbits.chat.interface.types import ChatResponseUnion, TextResponse
-from ragbits.core.llms.base import Usage
-from ragbits.evaluate.agent_simulation.metrics.collectors import MetricCollector
-
 if TYPE_CHECKING:
     from ragbits.evaluate.agent_simulation.results import TurnResult
 
 
-class LatencyMetricCollector(MetricCollector):
+class LatencyMetricCollector:
     """Tracks response latency per turn.
 
     Measures the wall-clock time from turn start to turn end,
     providing average, min, max, and per-turn latency metrics.
 
     Example:
+        >>> collector = LatencyMetricCollector()
         >>> result = await run_simulation(
         ...     scenario=scenario,
         ...     chat=chat,
-        ...     config=SimulationConfig(metrics=[LatencyMetricCollector]),
+        ...     metric_collectors=[collector],
         ... )
         >>> print(result.metrics.custom["latency_avg_ms"])
     """
@@ -33,7 +29,6 @@ class LatencyMetricCollector(MetricCollector):
         """Initialize the latency collector."""
         self._turn_start: float | None = None
         self._latencies: list[float] = []
-        self._times_to_first_token: dict[int, float] = {}
 
     def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
         """Record the start time for this turn.
@@ -45,20 +40,6 @@ class LatencyMetricCollector(MetricCollector):
         """
         self._turn_start = time.perf_counter()
 
-    def on_streamed_response(
-        self, turn_index: int, task_index: int, user_message: str, response: ChatResponseUnion
-    ) -> None:
-        """Record time to first token on first text response.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message (unused).
-            response: Response chunk from chat interface.
-        """
-        if turn_index not in self._times_to_first_token and isinstance(response, TextResponse):
-            self._times_to_first_token[turn_index] = (time.perf_counter() - self._turn_start) * 1000
-
     def on_turn_end(self, turn_result: TurnResult) -> None:
         """Calculate and store the latency for this turn.
 
@@ -80,47 +61,41 @@ class LatencyMetricCollector(MetricCollector):
             Dictionary with latency_avg_ms, latency_max_ms, latency_min_ms,
             and latency_per_turn_ms.
         """
-        ttfts = list(self._times_to_first_token.values())
-
-        rv = {}
-        if self._latencies:
-            rv.update(
-                {
-                    "latency_avg_ms": sum(self._latencies) / len(self._latencies),
-                    "latency_max_ms": max(self._latencies),
-                    "latency_min_ms": min(self._latencies),
-                }
-            )
-        if ttfts:
-            rv.update(
-                {
-                    "time_to_first_token_avg_ms": sum(ttfts) / len(ttfts),
-                    "time_to_first_token_max_ms": max(ttfts),
-                    "time_to_first_token_min_ms": min(ttfts),
-                }
-            )
-
-        return rv
-
-
-class TokenUsageMetricCollector(MetricCollector):
+        if not self._latencies:
+            return {}
+
+        return {
+            "latency_avg_ms": sum(self._latencies) / len(self._latencies),
+            "latency_max_ms": max(self._latencies),
+            "latency_min_ms": min(self._latencies),
+            "latency_per_turn_ms": self._latencies.copy(),
+        }
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._turn_start = None
+        self._latencies = []
+
+
+class TokenUsageMetricCollector:
     """Tracks token usage and estimated cost per turn.
 
     Aggregates token counts from each turn to provide total and
     per-turn token usage statistics.
 
     Example:
+        >>> collector = TokenUsageMetricCollector()
         >>> result = await run_simulation(
         ...     scenario=scenario,
         ...     chat=chat,
-        ...     config=SimulationConfig(metrics=[TokenUsageMetricCollector]),
+        ...     metric_collectors=[collector],
         ... )
         >>> print(result.metrics.custom["tokens_total"])
     """
 
     def __init__(self) -> None:
         """Initialize the token usage collector."""
-        self._usage: dict[int, Usage] = {}
+        self._turn_tokens: list[dict[str, int]] = []
 
     def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
         """No-op for token collector.
@@ -138,7 +113,10 @@ class TokenUsageMetricCollector(MetricCollector):
         Args:
             turn_result: The result of the completed turn.
         """
-        self._usage[turn_result.turn_index] = turn_result.token_usage
+        if turn_result.token_usage:
+            self._turn_tokens.append(turn_result.token_usage.copy())
+        else:
+            self._turn_tokens.append({"total": 0, "prompt": 0, "completion": 0})
 
     def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
         """Return aggregated token usage metrics.
@@ -148,33 +126,40 @@ class TokenUsageMetricCollector(MetricCollector):
 
         Returns:
             Dictionary with tokens_total, tokens_prompt, tokens_completion,
-            tokens_avg_per_turn, and estimated_usd.
+            tokens_avg_per_turn, and tokens_per_turn.
         """
-        if not self._usage:
+        if not self._turn_tokens:
             return {}
 
-        total_usage = reduce(lambda a, b: a + b, self._usage.values())
+        total = sum(t.get("total", 0) for t in self._turn_tokens)
+        prompt = sum(t.get("prompt", 0) for t in self._turn_tokens)
+        completion = sum(t.get("completion", 0) for t in self._turn_tokens)
 
         return {
-            "tokens_total": total_usage.total_tokens,
-            "tokens_prompt": total_usage.prompt_tokens,
-            "tokens_completion": total_usage.completion_tokens,
-            "tokens_avg_per_turn": total_usage.total_tokens / len(self._usage),
-            "estimated_usd": total_usage.estimated_cost,
+            "tokens_total": total,
+            "tokens_prompt": prompt,
+            "tokens_completion": completion,
+            "tokens_avg_per_turn": total / len(self._turn_tokens) if self._turn_tokens else 0,
+            "tokens_per_turn": [t.get("total", 0) for t in self._turn_tokens],
         }
 
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._turn_tokens = []
 
-class ToolUsageMetricCollector(MetricCollector):
+
+class ToolUsageMetricCollector:
     """Tracks tool call patterns during the conversation.
 
     Records which tools were called, how often, and on which turns,
     providing insights into agent behavior.
 
     Example:
+        >>> collector = ToolUsageMetricCollector()
         >>> result = await run_simulation(
         ...     scenario=scenario,
         ...     chat=chat,
-        ...     config=SimulationConfig(metrics=[ToolUsageMetricCollector]),
+        ...     metric_collectors=[collector],
         ... )
         >>> print(result.metrics.custom["tools_unique"])
     """
@@ -229,3 +214,8 @@ class ToolUsageMetricCollector(MetricCollector):
             "tools_per_turn": self._tool_calls.copy(),
             "turns_with_tools": turns_with_tools,
         }
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._tool_calls = []
+        self._tool_counts = {}
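
Taken together, these hunks drop the streamed-response/time-to-first-token path and the Usage-based accounting in favor of plain per-turn lists, and every built-in collector gains a reset() hook. A minimal usage sketch following the updated docstring examples; run_simulation, scenario, and chat are assumed to be provided by the surrounding harness:

    # Sketch based on the docstring examples above, not package documentation.
    latency = LatencyMetricCollector()
    tokens = TokenUsageMetricCollector()

    result = await run_simulation(
        scenario=scenario,  # assumed defined elsewhere
        chat=chat,          # assumed defined elsewhere
        metric_collectors=[latency, tokens],
    )
    print(result.metrics.custom["latency_avg_ms"])
    print(result.metrics.custom["tokens_total"])

    # Instances are now passed directly rather than re-created per run, so
    # reset() them before reusing the same collectors for another conversation.
    latency.reset()
    tokens.reset()
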
@@ -2,16 +2,14 @@
 
 from __future__ import annotations
 
-from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any
-
-from ragbits.chat.interface.types import ChatResponseUnion
+from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
 
 if TYPE_CHECKING:
     from ragbits.evaluate.agent_simulation.results import TurnResult
 
 
-class MetricCollector(ABC):
+@runtime_checkable
+class MetricCollector(Protocol):
     """Protocol for collecting metrics during conversation simulation.
 
     Implement this protocol to create custom metric collectors that can
@@ -33,7 +31,7 @@ class MetricCollector(ABC):
         ...     pass
     """
 
-    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:  # noqa: PLR6301
+    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
         """Called before agent processes a turn.
 
         Args:
@@ -41,30 +39,16 @@ class MetricCollector(ABC):
             task_index: 0-based index of the current task.
             user_message: The user message being sent to the agent.
         """
-        return
-
-    def on_streamed_response(  # noqa: PLR6301
-        self, turn_index: int, task_index: int, user_message: str, response: ChatResponseUnion
-    ) -> None:
-        """Called after receiving chunk from chat interface.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message being sent to the agent.
-            response: Response yielded from chat, usually command or text chunk.
-        """
-        return
+        ...
 
-    def on_turn_end(self, turn_result: TurnResult) -> None:  # noqa: PLR6301
+    def on_turn_end(self, turn_result: TurnResult) -> None:
         """Called after a turn completes.
 
         Args:
             turn_result: The result of the completed turn.
         """
-        return
+        ...
 
-    @abstractmethod
     def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
         """Called when the conversation ends, returns computed metrics.
 
@@ -74,6 +58,11 @@ class MetricCollector(ABC):
         Returns:
             Dictionary of metric names to values.
         """
+        ...
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        ...
 
 
 class CompositeMetricCollector:
@@ -123,20 +112,6 @@ class CompositeMetricCollector:
         for collector in self._collectors:
             collector.on_turn_start(turn_index, task_index, user_message)
 
-    def on_streamed_response(
-        self, turn_index: int, task_index: int, user_message: str, response: ChatResponseUnion
-    ) -> None:
-        """Delegate to all child collectors.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message being sent to the agent.
-            response: Response yielded from chat, usually command or text chunk.
-        """
-        for collector in self._collectors:
-            collector.on_streamed_response(turn_index, task_index, user_message, response)
-
     def on_turn_end(self, turn_result: TurnResult) -> None:
         """Delegate to all child collectors.
 
@@ -160,3 +135,8 @@ class CompositeMetricCollector:
             metrics = collector.on_conversation_end(all_turns)
             combined.update(metrics)
         return combined
+
+    def reset(self) -> None:
+        """Reset all child collectors."""
+        for collector in self._collectors:
+            collector.reset()
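
Since MetricCollector is now a runtime_checkable Protocol rather than an ABC, custom collectors no longer inherit from it: any class providing the four hook methods satisfies the contract structurally, which is why the built-in collectors above dropped their base class. A minimal sketch of a collector under the new contract; TurnCountCollector is a hypothetical example, not part of the package:

    from typing import Any

    from ragbits.evaluate.agent_simulation.metrics.collectors import MetricCollector
    from ragbits.evaluate.agent_simulation.results import TurnResult


    class TurnCountCollector:
        """Hypothetical collector: counts completed turns, no subclassing needed."""

        def __init__(self) -> None:
            self._turns = 0

        def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
            ...  # nothing to do at turn start

        def on_turn_end(self, turn_result: TurnResult) -> None:
            self._turns += 1

        def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
            return {"turn_count": self._turns}

        def reset(self) -> None:
            self._turns = 0


    # runtime_checkable allows a structural isinstance() check against the Protocol.
    assert isinstance(TurnCountCollector(), MetricCollector)
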
@@ -1,217 +1,37 @@
 """Data models for agent simulation scenarios."""
 
-from __future__ import annotations
+from dataclasses import dataclass
 
-from collections.abc import Callable
-from typing import TYPE_CHECKING, Any, Literal
 
-from pydantic import BaseModel, Field, create_model
-
-from ragbits.evaluate.agent_simulation.context import DataSnapshot, DomainContext
-from ragbits.evaluate.agent_simulation.metrics.collectors import MetricCollector
-
-if TYPE_CHECKING:
-    from rich.console import Console
-
-    from ragbits.evaluate.agent_simulation.checkers import BaseCheckerConfig
-    from ragbits.evaluate.agent_simulation.display import ScenarioLiveDisplay
-
-
-class Turn(BaseModel):
+@dataclass
+class Turn:
     """A single conversation turn between user and assistant."""
 
     user: str
     assistant: str
 
 
-class Task(BaseModel):
-    """A singular task or goal that simulated user is destined to complete."""
-
-    task: str = Field(
-        ...,
-        description="A natural language description of the objective that simulated user needs to complete.",
-    )
-    checkers: list[dict[str, Any]] = Field(
-        default_factory=list,
-        description="List of checker configurations. Each dict must have 'type' key and checker-specific fields.",
-    )
-    checker_mode: Literal["all", "any"] = Field(
-        default="all",
-        description="How to combine multiple checkers: 'all' (all must pass), 'any' (one must pass)",
-    )
-
-    def get_parsed_checkers(self) -> list[BaseCheckerConfig]:
-        """Parse checker configs into typed checker instances.
+@dataclass
+class Task:
+    """A single task with its expected result."""
 
-        Returns:
-            List of parsed checker config instances.
-        """
-        from ragbits.evaluate.agent_simulation.checkers import parse_checker_config
+    task: str
+    expected_result: str
+    expected_tools: list[str] | None = None
+    """Optional list of tool names that should be used to complete this task."""
 
-        return [parse_checker_config(c) for c in self.checkers]
 
-    def get_checker_summary(self) -> str:
-        """Get a human-readable summary of configured checkers.
-
-        Returns:
-            Summary string describing the checkers.
-        """
-        if not self.checkers:
-            return "no checkers"
-
-        types = [c.get("type", "unknown") for c in self.checkers]
-        return f"{', '.join(types)} ({self.checker_mode})"
-
-
-class Scenario(BaseModel):
+@dataclass
+class Scenario:
     """A scenario containing multiple tasks to be completed sequentially."""
 
-    name: str = Field(..., description="Short name identyfing the scenario")
-    tasks: list[Task] = Field(
-        default_factory=list,
-        description=(
-            "List of tasks that will be executed during the scenario. "
-            "Simulating LLM will use this list to determine next steps. "
-            "It can be both treated as conversation outline or a checklist "
-            "that should be realized by simulated user. "
-            "Expected result will be used to judge if specific exchange of messages "
-            "was aligned with system expectactions. "
-        ),
-    )
-
-    turn_limit: int | None = Field(
-        None,
-        description=(
-            "Limit how many turns can be ran before failing the scenario. "
-            "If set here it will override default settings."
-        ),
-    )
-    turn_limit_per_task: int | None = Field(
-        None,
-        description="Limit number of turns, this time per task. Specific tasks can override their limits.",
-    )
-
-    group: str | None = Field(
-        None,
-        description=(
-            "Scenarios may be coupled together by being in the same group. "
-            "Scenarios in groups are often executed one after another, "
-            "may have some sort of dependencies or inference. "
-            "In final results aggregated group metrics can be found."
-        ),
-    )
+    name: str
+    tasks: list[Task]
 
-    def display(self, console: Console | None = None) -> None:
-        """Display scenario with rich panel."""
-        from ragbits.evaluate.agent_simulation.display import display_scenario
 
-        display_scenario(self, console)
-
-    def live_display(self, console: Console | None = None) -> ScenarioLiveDisplay:
-        """Create a live display for this scenario."""
-        from ragbits.evaluate.agent_simulation.display import ScenarioLiveDisplay
-
-        return ScenarioLiveDisplay(self, console)
-
-    @classmethod
-    def dto(cls) -> type[Scenario]:
-        """Create a DTO class for serialization."""
-        if not hasattr(cls, "_dto_cls"):
-            cls._dto_cls = create_model(
-                "ScenarioDTO",
-                __base__=cls,
-                name=(str, cls.__pydantic_fields__["name"]),
-                tasks=(list[Task], cls.__pydantic_fields__["tasks"]),
-            )
-        return cls._dto_cls
-
-
-class Personality(BaseModel):
+@dataclass
+class Personality:
     """A personality definition for the simulated user."""
 
-    name: str = Field(
-        ...,
-        description="A descriptive name that will help to identify this specific instance of personality.",
-    )
-    description: str = Field(
-        ...,
-        description=(
-            "Detailed description of user behaviour, style of communication, "
-            "internal motives, language, attitute, etc."
-        ),
-    )
-
-
-class SimulationConfig(BaseModel):
-    """Configuration for running agent simulations.
-
-    Groups parameters that are commonly passed between simulation components.
-    Excludes instance-specific objects like ChatInterface, callbacks, and streams.
-    """
-
-    model_config = {"arbitrary_types_allowed": True}
-
-    max_turns_scenario: int = Field(
-        default=15,
-        description="Maximum number of conversation turns for the entire scenario.",
-    )
-    max_turns_task: int | None = Field(
-        default=4,
-        description="Maximum number of conversation turns per task (None for no limit).",
-    )
-    log_file: str | None = Field(
-        default=None,
-        description="Optional path to log file.",
-    )
-    agent_model_name: str | None = Field(
-        default=None,
-        description="Optional override for agent LLM model name.",
-    )
-    sim_user_model_name: str | None = Field(
-        default=None,
-        description="Optional override for simulated user LLM model name.",
-    )
-    checker_model_name: str | None = Field(
-        default=None,
-        description="Optional override for goal checker LLM model name.",
-    )
-    default_model: str = Field(
-        default="gpt-4o-mini",
-        description="Default LLM model name when specific models not provided.",
-    )
-    api_key: str = Field(
-        default="",
-        description="API key for LLM.",
-    )
-    user_message_prefix: str = Field(
-        default="",
-        description="Optional prefix to add to user messages before sending to agent.",
-    )
-    domain_context: DomainContext | None = Field(
-        default=None,
-        description="Optional domain context for goal checking (currency, locale, business rules).",
-    )
-    data_snapshot: DataSnapshot | None = Field(
-        default=None,
-        description="Optional data snapshot to ground simulated user requests to available data.",
-    )
-    metrics: list[type[MetricCollector] | Callable[[], MetricCollector]] | None = Field(
-        default=None,
-        description=(
-            "Optional list of metric collector factories. Each item can be either a class "
-            "(e.g., LatencyMetricCollector) or a callable that returns a collector instance "
-            "(e.g., lambda: CustomCollector(arg=value)). Fresh instances are created for each run."
-        ),
-    )
-
-    def create_metric_collectors(self) -> list[MetricCollector]:
-        """Create fresh metric collector instances for a simulation run.
-
-        Each call creates new instances to ensure concurrent runs don't share state.
-
-        Returns:
-            List of freshly instantiated metric collectors.
-        """
-        if not self.metrics:
-            return []
-        return [factory() for factory in self.metrics]
+    name: str
+    description: str
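
The models module thus shrinks from pydantic models carrying checker configuration, turn limits, display helpers, and the whole SimulationConfig down to four plain dataclasses, with the expected result and expected tools stated inline on each Task. A minimal construction sketch against the new shape; the module path, scenario name, and tool name are illustrative assumptions:

    from ragbits.evaluate.agent_simulation.models import Scenario, Task  # path assumed

    scenario = Scenario(
        name="refund-flow",  # hypothetical name
        tasks=[
            Task(
                task="Ask for a refund for the most recent order",
                expected_result="The agent confirms a refund has been initiated",
                expected_tools=["issue_refund"],  # hypothetical tool name
            ),
        ],
    )
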