ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,8 +5,6 @@ from datetime import datetime
 from enum import Enum
 from typing import Any
 
-from ragbits.core.llms.base import Usage
-
 
 class SimulationStatus(str, Enum):
     """Status of a simulation run."""
@@ -17,60 +15,6 @@ class SimulationStatus(str, Enum):
     TIMEOUT = "timeout"
 
 
-@dataclass
-class CheckerResultItem:
-    """Result of a single checker evaluation."""
-
-    type: str
-    completed: bool
-    reason: str
-
-    def to_dict(self) -> dict[str, Any]:
-        """Convert to dictionary."""
-        return {"type": self.type, "completed": self.completed, "reason": self.reason}
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> "CheckerResultItem":
-        """Create from dictionary."""
-        return cls(
-            type=data.get("type", data.get("checker_type", "unknown")),
-            completed=data.get("completed", False),
-            reason=data.get("reason", ""),
-        )
-
-
-@dataclass
-class ResponseChunk:
-    """A response chunk from the ChatInterface stream."""
-
-    turn_index: int
-    task_index: int
-    chunk_index: int
-    chunk_type: str
-    chunk_data: dict[str, Any]
-
-    def to_dict(self) -> dict[str, Any]:
-        """Convert to dictionary."""
-        return {
-            "turn_index": self.turn_index,
-            "task_index": self.task_index,
-            "chunk_index": self.chunk_index,
-            "chunk_type": self.chunk_type,
-            "chunk_data": self.chunk_data,
-        }
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> "ResponseChunk":
-        """Create from dictionary."""
-        return cls(
-            turn_index=data.get("turn_index", 0),
-            task_index=data.get("task_index", 0),
-            chunk_index=data.get("chunk_index", 0),
-            chunk_type=data.get("chunk_type", "unknown"),
-            chunk_data=data.get("chunk_data", {}),
-        )
-
-
 @dataclass
 class TurnResult:
     """Result of a single conversation turn."""
@@ -82,10 +26,8 @@ class TurnResult:
     tool_calls: list[dict[str, Any]] = field(default_factory=list)
     task_completed: bool = False
     task_completed_reason: str = ""
-    token_usage: Usage = field(default_factory=Usage)
+    token_usage: dict[str, int] | None = None
     latency_ms: float | None = None
-    checkers: list[CheckerResultItem] = field(default_factory=list)
-    checker_mode: str = "all"
 
 
 @dataclass
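
With this change a turn's token usage is a plain dict (or None) instead of a Usage object, so it is JSON-serializable as-is. A minimal sketch of the new shape follows; the key names are an assumption modelled on the aggregate metric fields, not something this diff pins down.

import json

# Hypothetical per-turn usage payload under the new `dict[str, int] | None` typing.
# Key names are assumed (mirroring prompt_tokens / completion_tokens / total_tokens).
token_usage: dict[str, int] | None = {
    "prompt_tokens": 512,
    "completion_tokens": 128,
    "total_tokens": 640,
}

# No .model_dump() call is needed before dumping anymore.
print(json.dumps(token_usage))
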
@@ -94,51 +36,32 @@ class TaskResult:
 
     task_index: int
     description: str
+    expected_result: str | None
     completed: bool
     turns_taken: int
     final_reason: str
-    checkers: list[dict[str, Any]] = field(default_factory=list)
-    checker_mode: str = "all"
 
 
 @dataclass
 class ConversationMetrics:
-    """Aggregate metrics for the conversation.
-
-    All metrics are stored in a single flat dictionary. Built-in metrics include:
-    - total_turns: Number of conversation turns
-    - total_tasks: Number of tasks in the scenario
-    - tasks_completed: Number of successfully completed tasks
-    - success_rate: Ratio of completed tasks
-    - total_tokens, prompt_tokens, completion_tokens: Token usage
-    - total_cost_usd: Estimated cost
-    - latency_avg_ms, latency_min_ms, latency_max_ms: Response latency
-    - tools_total_calls, tools_unique, tools_counts: Tool usage
-
-    Additional metrics from custom collectors are merged into this dict.
-    """
-
-    metrics: dict[str, Any] = field(default_factory=dict)
-
-    @property
-    def total_turns(self) -> int:
-        """Number of conversation turns."""
-        return self.metrics.get("total_turns", 0)
-
-    @property
-    def total_tasks(self) -> int:
-        """Number of tasks in scenario."""
-        return self.metrics.get("total_tasks", 0)
-
-    @property
-    def tasks_completed(self) -> int:
-        """Number of completed tasks."""
-        return self.metrics.get("tasks_completed", 0)
+    """Aggregate metrics for the conversation."""
+
+    total_turns: int
+    total_tasks: int
+    tasks_completed: int
+    total_tokens: int = 0
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    total_cost_usd: float = 0.0
+    deepeval_scores: dict[str, float] = field(default_factory=dict)
+    custom: dict[str, Any] = field(default_factory=dict)
 
     @property
     def success_rate(self) -> float:
         """Calculate task success rate."""
-        return self.metrics.get("success_rate", 0.0)
+        if self.total_tasks == 0:
+            return 0.0
+        return self.tasks_completed / self.total_tasks
 
 
 @dataclass
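
ConversationMetrics now stores explicit counters instead of one flat metrics dict, and success_rate is derived from them. A minimal usage sketch follows; the import path is an assumption (this diff does not show which module defines the class) and the numbers are illustrative.

# Assumed import path -- the defining module is not named in this diff.
from ragbits.evaluate.agent_simulation.results import ConversationMetrics

metrics = ConversationMetrics(
    total_turns=6,
    total_tasks=3,
    tasks_completed=2,
    total_tokens=4200,
    prompt_tokens=3000,
    completion_tokens=1200,
    total_cost_usd=0.012,
    deepeval_scores={"answer_relevancy": 0.91},
)

# success_rate is a computed property (tasks_completed / total_tasks),
# not a value read back from a stored dict.
print(round(metrics.success_rate, 3))  # 0.667
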
@@ -154,23 +77,15 @@ class SimulationResult:
     turns: list[TurnResult] = field(default_factory=list)
     tasks: list[TaskResult] = field(default_factory=list)
     metrics: ConversationMetrics | None = None
-    response_chunks: list[ResponseChunk] = field(default_factory=list)
 
     # Optional metadata
     end_time: datetime | None = None
     agent_model: str | None = None
     simulated_user_model: str | None = None
     checker_model: str | None = None
-    persona: str | None = None
+    personality: str | None = None
     error: str | None = None
 
-    # Conversation context
-    conversation_id: str | None = None
-    final_state: dict[str, Any] = field(default_factory=dict)
-
-    # Traces from the chat interface
-    traces: list[dict[str, Any]] = field(default_factory=list)
-
     def to_dict(self) -> dict[str, Any]:
         """Convert to JSON-serializable dictionary."""
         return {
@@ -181,11 +96,8 @@
             "agent_model": self.agent_model,
             "simulated_user_model": self.simulated_user_model,
             "checker_model": self.checker_model,
-            "persona": self.persona,
+            "personality": self.personality,
             "error": self.error,
-            "conversation_id": self.conversation_id,
-            "final_state": self.final_state,
-            "response_chunks": [c.to_dict() for c in self.response_chunks],
             "turns": [
                 {
                     "turn_index": t.turn_index,
@@ -195,10 +107,8 @@
                     "tool_calls": t.tool_calls,
                     "task_completed": t.task_completed,
                     "task_completed_reason": t.task_completed_reason,
-                    "token_usage": t.token_usage.model_dump() if t.token_usage else None,
+                    "token_usage": t.token_usage,
                     "latency_ms": t.latency_ms,
-                    "checkers": [c.to_dict() for c in t.checkers],
-                    "checker_mode": t.checker_mode,
                 }
                 for t in self.turns
             ],
@@ -206,16 +116,27 @@
                 {
                     "task_index": t.task_index,
                     "description": t.description,
+                    "expected_result": t.expected_result,
                     "completed": t.completed,
                     "turns_taken": t.turns_taken,
                     "final_reason": t.final_reason,
-                    "checkers": t.checkers,
-                    "checker_mode": t.checker_mode,
                 }
                 for t in self.tasks
             ],
-            "metrics": self.metrics.metrics if self.metrics else None,
-            "traces": self.traces,
+            "metrics": {
+                "total_turns": self.metrics.total_turns,
+                "total_tasks": self.metrics.total_tasks,
+                "tasks_completed": self.metrics.tasks_completed,
+                "success_rate": self.metrics.success_rate,
+                "total_tokens": self.metrics.total_tokens,
+                "prompt_tokens": self.metrics.prompt_tokens,
+                "completion_tokens": self.metrics.completion_tokens,
+                "total_cost_usd": self.metrics.total_cost_usd,
+                "deepeval_scores": self.metrics.deepeval_scores,
+                "custom": self.metrics.custom,
+            }
+            if self.metrics
+            else None,
         }
 
     @classmethod
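
The "metrics" entry of to_dict() is now built inline from those explicit fields (including the derived success_rate) and becomes None when no metrics were collected. The sketch below shows the shape of the emitted sub-dictionary; all values are made up.

# Illustrative payload only -- the keys written by the new "metrics" branch above.
metrics_payload = {
    "total_turns": 6,
    "total_tasks": 3,
    "tasks_completed": 2,
    "success_rate": 2 / 3,  # derived property, now serialized explicitly
    "total_tokens": 4200,
    "prompt_tokens": 3000,
    "completion_tokens": 1200,
    "total_cost_usd": 0.012,
    "deepeval_scores": {"answer_relevancy": 0.91},
    "custom": {},
}
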
@@ -232,8 +153,6 @@
                 task_completed_reason=t.get("task_completed_reason", ""),
                 token_usage=t.get("token_usage"),
                 latency_ms=t.get("latency_ms"),
-                checkers=[CheckerResultItem.from_dict(c) for c in t.get("checkers", [])],
-                checker_mode=t.get("checker_mode", "all"),
             )
             for t in data.get("turns", [])
         ]
@@ -242,19 +161,28 @@
             TaskResult(
                 task_index=t["task_index"],
                 description=t["description"],
+                expected_result=t.get("expected_result"),
                 completed=t["completed"],
                 turns_taken=t["turns_taken"],
                 final_reason=t["final_reason"],
-                checkers=t.get("checkers", []),
-                checker_mode=t.get("checker_mode", "all"),
             )
             for t in data.get("tasks", [])
         ]
 
         metrics_data = data.get("metrics")
-        metrics = ConversationMetrics(metrics=metrics_data) if metrics_data else None
-
-        response_chunks = [ResponseChunk.from_dict(c) for c in data.get("response_chunks", [])]
+        metrics = None
+        if metrics_data:
+            metrics = ConversationMetrics(
+                total_turns=metrics_data["total_turns"],
+                total_tasks=metrics_data["total_tasks"],
+                tasks_completed=metrics_data["tasks_completed"],
+                total_tokens=metrics_data.get("total_tokens", 0),
+                prompt_tokens=metrics_data.get("prompt_tokens", 0),
+                completion_tokens=metrics_data.get("completion_tokens", 0),
+                total_cost_usd=metrics_data.get("total_cost_usd", 0.0),
+                deepeval_scores=metrics_data.get("deepeval_scores", {}),
+                custom=metrics_data.get("custom", {}),
+            )
 
         return cls(
             scenario_name=data["scenario_name"],
@@ -264,13 +192,9 @@
             agent_model=data.get("agent_model"),
             simulated_user_model=data.get("simulated_user_model"),
             checker_model=data.get("checker_model"),
-            persona=data.get("persona", data.get("personality")),  # backwards compat
+            personality=data.get("personality"),
             error=data.get("error"),
-            conversation_id=data.get("conversation_id"),
-            final_state=data.get("final_state", {}),
             turns=turns,
             tasks=tasks,
             metrics=metrics,
-            response_chunks=response_chunks,
-            traces=data.get("traces", []),
         )
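
from_dict no longer falls back to the legacy "persona" key, so payloads written by older versions need that key renamed before loading. A hedged migration sketch follows; the helper is hypothetical, and a real payload also carries the required keys (e.g. "scenario_name") that this hunk does not show.

def migrate_payload(data: dict) -> dict:
    """Rename the legacy "persona" key, which from_dict no longer reads."""
    if "personality" not in data and "persona" in data:
        data["personality"] = data.pop("persona")
    return data

# Illustrative fragment of an old payload.
old_fragment = {"persona": "impatient customer", "error": None}
print(migrate_payload(old_fragment))  # {'error': None, 'personality': 'impatient customer'}
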
@@ -1,52 +1,26 @@
 """Scenario loading functionality for agent simulation."""
 
 import json
-from dataclasses import dataclass, field
 from pathlib import Path
 
 from ragbits.evaluate.agent_simulation.models import Personality, Scenario, Task
 
 
-@dataclass
-class ScenarioFile:
-    """Represents a loaded scenario file with its metadata."""
-
-    filename: str
-    group: str | None
-    scenarios: list[Scenario] = field(default_factory=list)
-
-
 def load_scenarios(scenarios_file: str = "scenarios.json") -> list[Scenario]:
     """Load scenarios from a JSON file.
 
-    Expected JSON format (new format with file-level group):
-    {
-        "group": "Group Name",
-        "scenarios": [
-            {
-                "name": "Scenario 1",
-                "tasks": [
-                    {
-                        "task": "task description",
-                        "checkers": [
-                            {"type": "llm", "expected_result": "expected result"},
-                            {"type": "tool_call", "tools": ["tool1", "tool2"]},
-                            {"type": "state", "checks": [{"key": "user.confirmed", "value": true}]}
-                        ],
-                        "checker_mode": "all"
-                    },
-                    ...
-                ]
-            },
-            ...
-        ]
-    }
-
-    Legacy format (array of scenarios) is still supported:
+    Expected JSON format:
     [
         {
             "name": "Scenario 1",
-            "tasks": [...]
+            "tasks": [
+                {
+                    "task": "task description",
+                    "expected_result": "expected result description",
+                    "expected_tools": ["tool1", "tool2"] # optional
+                },
+                ...
+            ]
         },
         ...
     ]
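
A small end-to-end sketch of the flat format described above: write a one-scenario file and load it back. The import path of load_scenarios is an assumption (this diff does not name the module), and the scenario content is illustrative.

import json
from pathlib import Path

# Assumed module path for the loader defined in this file.
from ragbits.evaluate.agent_simulation.scenarios import load_scenarios

Path("scenarios.json").write_text(
    json.dumps(
        [
            {
                "name": "Password reset",
                "tasks": [
                    {
                        "task": "Ask the agent to reset the account password",
                        "expected_result": "The agent confirms a reset email was sent",
                        "expected_tools": ["send_reset_email"],
                    }
                ],
            }
        ]
    ),
    encoding="utf-8",
)

scenarios = load_scenarios("scenarios.json")
print(scenarios[0].name, len(scenarios[0].tasks))
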
@@ -57,35 +31,6 @@ def load_scenarios(scenarios_file: str = "scenarios.json") -> list[Scenario]:
     Returns:
         List of Scenario objects
 
-    Raises:
-        FileNotFoundError: If the scenarios file doesn't exist
-        ValueError: If the file format is invalid
-    """
-    scenario_file = load_scenario_file(scenarios_file)
-    return scenario_file.scenarios
-
-
-def load_scenario_file(scenarios_file: str = "scenarios.json") -> ScenarioFile:
-    """Load scenarios from a JSON file with file-level metadata.
-
-    This function supports both the new format with file-level group:
-    {
-        "group": "Group Name",
-        "scenarios": [...]
-    }
-
-    And the legacy format (array of scenarios):
-    [
-        {"name": "Scenario 1", "tasks": [...]},
-        ...
-    ]
-
-    Args:
-        scenarios_file: Path to the JSON file containing scenarios
-
-    Returns:
-        ScenarioFile object containing scenarios and file-level metadata
-
     Raises:
         FileNotFoundError: If the scenarios file doesn't exist
         ValueError: If the file format is invalid
@@ -97,31 +42,16 @@ def load_scenario_file(scenarios_file: str = "scenarios.json") -> ScenarioFile:
     with scenarios_path.open("r", encoding="utf-8") as f:
         data = json.load(f)
 
-    # Determine format and extract scenarios data and file-level group
-    file_group: str | None = None
-    scenarios_data: list
-
-    if isinstance(data, dict):
-        # New format: {"group": "...", "scenarios": [...]}
-        file_group = data.get("group")
-        scenarios_data = data.get("scenarios", [])
-        if not isinstance(scenarios_data, list):
-            raise ValueError(f"'scenarios' field must be a JSON array, got {type(scenarios_data).__name__}")
-    elif isinstance(data, list):
-        # Legacy format: [...]
-        scenarios_data = data
-    else:
-        raise ValueError(f"Scenarios file must contain a JSON object or array, got {type(data).__name__}")
+    if not isinstance(data, list):
+        raise ValueError(f"Scenarios file must contain a JSON array, got {type(data).__name__}")
 
     scenarios: list[Scenario] = []
-    for scenario_data in scenarios_data:
+    for scenario_data in data:
         if not isinstance(scenario_data, dict):
             raise ValueError(f"Each scenario must be a JSON object, got {type(scenario_data).__name__}")
 
         name = scenario_data.get("name", "")
         tasks_data = scenario_data.get("tasks", [])
-        # Scenario can have its own group, or inherit from file-level group
-        scenario_group = scenario_data.get("group") or file_group
 
         if not isinstance(tasks_data, list):
             raise ValueError(f"Tasks must be a JSON array, got {type(tasks_data).__name__}")
@@ -132,24 +62,18 @@ def load_scenario_file(scenarios_file: str = "scenarios.json") -> ScenarioFile:
                 raise ValueError(f"Each task must be a JSON object, got {type(task_data).__name__}")
 
             task_desc = task_data.get("task", "")
-            checkers = task_data.get("checkers", [])
-            checker_mode = task_data.get("checker_mode", "all")
-
-            if not isinstance(checkers, list):
-                raise ValueError(f"checkers must be a list, got {type(checkers).__name__}")
-
-            tasks.append(Task(task=task_desc, checkers=checkers, checker_mode=checker_mode))
+            expected_result = task_data.get("expected_result", "")
+            expected_tools = task_data.get("expected_tools")
+            if expected_tools is not None and not isinstance(expected_tools, list):
+                raise ValueError(f"expected_tools must be a list or null, got {type(expected_tools).__name__}")
+            tasks.append(Task(task=task_desc, expected_result=expected_result, expected_tools=expected_tools))
 
-        scenarios.append(Scenario(name=name, tasks=tasks, group=scenario_group))
+        scenarios.append(Scenario(name=name, tasks=tasks))
 
     if not scenarios:
         raise ValueError(f"No scenarios found in {scenarios_path}")
 
-    return ScenarioFile(
-        filename=scenarios_path.name,
-        group=file_group,
-        scenarios=scenarios,
-    )
+    return scenarios
 
 
 def load_personalities(personalities_file: str = "personalities.json") -> list[Personality]:
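
The parsing above maps each task entry onto the Task model imported at the top of this file; it is roughly equivalent to the standalone sketch below (the task content is illustrative).

from ragbits.evaluate.agent_simulation.models import Task

task_data = {
    "task": "Ask for the current order status",
    "expected_result": "The agent reports the order status",
    "expected_tools": ["get_order_status"],  # optional; may be omitted or null
}

task = Task(
    task=task_data.get("task", ""),
    expected_result=task_data.get("expected_result", ""),
    expected_tools=task_data.get("expected_tools"),
)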