PyPI - agenteval-py - Versions diffs - 0.1.0__py3-none-any.whl - Mend

agenteval-py 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

agenteval/__init__.py +46 -0
agenteval/adapters/__init__.py +9 -0
agenteval/adapters/anthropic_adapter.py +80 -0
agenteval/adapters/langchain_adapter.py +135 -0
agenteval/adapters/openai_adapter.py +80 -0
agenteval/assertions.py +289 -0
agenteval/cli.py +93 -0
agenteval/exceptions.py +17 -0
agenteval/models.py +123 -0
agenteval/py.typed +0 -0
agenteval/registry.py +99 -0
agenteval/reporter.py +139 -0
agenteval/runner.py +119 -0
agenteval/suite.py +181 -0
agenteval/tracer.py +303 -0
agenteval_py-0.1.0.dist-info/METADATA +561 -0
agenteval_py-0.1.0.dist-info/RECORD +20 -0
agenteval_py-0.1.0.dist-info/WHEEL +4 -0
agenteval_py-0.1.0.dist-info/entry_points.txt +2 -0
agenteval_py-0.1.0.dist-info/licenses/LICENSE +21 -0

agenteval/__init__.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""agenteval — evaluation toolkit for LLM agents.
+Quick start::
+    import agenteval
+    @agenteval.test(n=20, threshold=0.8)
+    async def test_my_agent(tracer: agenteval.Tracer) -> None:
+        search = tracer.wrap(my_search_tool)
+        async with tracer.run(input="find Python tutorials") as run:
+            result = await my_agent("find Python tutorials", search=search)
+            run.set_output(result)
+        tracer.assert_that().called_tool("my_search_tool").no_errors().check()
+    # Run a single test directly:
+    result = agenteval.run(test_my_agent, n=10)
+    # Discover and run all @agenteval.test functions in a directory:
+    suite = agenteval.run_suite("tests/")
+"""
+from agenteval import adapters
+from agenteval.assertions import AssertionSet
+from agenteval.models import AgentTrace, SuiteResult, TestResult, ToolCall
+from agenteval.registry import test
+from agenteval.reporter import RichReporter
+from agenteval.runner import run
+from agenteval.suite import run_suite
+from agenteval.tracer import Tracer
+# Version string exposed as ``agenteval.__version__``.
+__version__ = "0.1.0"
+# Public API of the package: exactly the names imported above, listed
+# alphabetically. This is what ``from agenteval import *`` exports.
+__all__ = [
+    "AgentTrace",
+    "AssertionSet",
+    "RichReporter",
+    "SuiteResult",
+    "TestResult",
+    "ToolCall",
+    "Tracer",
+    "adapters",
+    "run",
+    "run_suite",
+    "test",
+]

agenteval/adapters/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Framework adapters for agenteval.
+Each adapter makes it easy to instrument a specific framework without
+changing agent code. Available adapters:
+- ``agenteval.adapters.openai_adapter`` — OpenAI function calling
+- ``agenteval.adapters.anthropic_adapter`` — Anthropic tool use
+- ``agenteval.adapters.langchain_adapter`` — LangChain callback handler
+"""

agenteval/adapters/anthropic_adapter.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""Anthropic tool use adapter for agenteval.
+Usage::
+    from agenteval.adapters.anthropic_adapter import wrap_tools, extract_token_usage
+    async def test_my_agent(tracer: Tracer) -> None:
+        tools = wrap_tools({"web_search": search_fn, "calculator": calc_fn}, tracer)
+        async with tracer.run(input=prompt) as run:
+            messages = [{"role": "user", "content": prompt}]
+            while True:
+                response = await client.messages.create(
+                    model="claude-sonnet-4-6",
+                    max_tokens=1024,
+                    tools=anthropic_tool_schemas,
+                    messages=messages,
+                )
+                run.set_token_usage(extract_token_usage(response))
+                if response.stop_reason == "tool_use":
+                    for block in response.content:
+                        if block.type == "tool_use":
+                            result = await tools[block.name](**block.input)
+                            # append tool result to messages ...
+                elif response.stop_reason == "end_turn":
+                    text = next(
+                        (b.text for b in response.content if b.type == "text"), ""
+                    )
+                    run.set_output(text)
+                    break
+        tracer.assert_that().called_tool("web_search").no_errors().check()
+"""
+from __future__ import annotations
+from typing import Any, Callable, Optional
+from agenteval.tracer import Tracer
+def wrap_tools(
+    tool_functions: dict[str, Callable[..., Any]],
+    tracer: Tracer,
+) -> dict[str, Callable[..., Any]]:
+    """Return tracer-wrapped versions of a dict of Anthropic tool functions.
+    Args:
+        tool_functions: Mapping of tool name → callable.
+        tracer: The Tracer whose ``wrap`` method instruments each callable.
+    Returns:
+        A new dict with the same keys (in the same order) whose values are
+        the results of ``tracer.wrap(fn, name=name)``. The input mapping is
+        not modified.
+    """
+    wrapped: dict[str, Callable[..., Any]] = {}
+    for tool_name, tool_fn in tool_functions.items():
+        # Register each tool under its dict key rather than fn.__name__.
+        wrapped[tool_name] = tracer.wrap(tool_fn, name=tool_name)
+    return wrapped
+def extract_token_usage(response: Any) -> Optional[dict[str, int]]:
+    """Extract token usage from an Anthropic Message response.
+    Args:
+        response: An object with a ``usage`` attribute, e.g. an
+            anthropic.types.Message.
+    Returns:
+        Dict containing whichever of input_tokens, output_tokens,
+        cache_read_input_tokens and cache_creation_input_tokens are set on
+        ``response.usage``; None if ``usage`` is missing or none of the
+        fields carries a value.
+    """
+    usage = getattr(response, "usage", None)
+    if usage is None:
+        return None
+    fields = (
+        "input_tokens",
+        "output_tokens",
+        "cache_read_input_tokens",
+        "cache_creation_input_tokens",
+    )
+    result: dict[str, int] = {}
+    for field in fields:
+        value = getattr(usage, field, None)
+        # A field can exist on the usage object but be unset (None), e.g. the
+        # cache counters. Skip those so every returned value is an int, as
+        # the return annotation promises.
+        if value is not None:
+            result[field] = value
+    return result or None

agenteval/adapters/langchain_adapter.py ADDED Viewed

@@ -0,0 +1,135 @@
+"""LangChain callback handler adapter for agenteval.
+Usage::
+    from agenteval.adapters.langchain_adapter import AgentEvalCallbackHandler
+    async def test_langchain_agent(tracer: Tracer) -> None:
+        handler = AgentEvalCallbackHandler()
+        async with tracer.run(input="find Italian restaurants") as run:
+            result = await agent.ainvoke(
+                {"input": "find Italian restaurants"},
+                config={"callbacks": [handler]},
+            )
+            run.set_output(result.get("output", ""))
+        tracer.assert_that().called_tool("restaurant_search").no_errors().check()
+The handler reads the active Tracer from the _ACTIVE_TRACER ContextVar, so it
+works automatically when used inside agenteval.run() — no explicit tracer
+reference needed.
+"""
+from __future__ import annotations
+import time
+from typing import Any
+from uuid import UUID
+from agenteval.tracer import Tracer
+# langchain-core is an optional dependency. Try to import it and remember
+# whether that worked in _LANGCHAIN_AVAILABLE.
+try:
+    from langchain_core.callbacks.base import BaseCallbackHandler
+    from langchain_core.outputs import LLMResult
+    _LANGCHAIN_AVAILABLE = True
+except ImportError:
+    _LANGCHAIN_AVAILABLE = False
+    # Placeholders so the names stay defined and the class statement below
+    # (which subclasses BaseCallbackHandler) still executes at import time.
+    BaseCallbackHandler = object  # type: ignore[assignment,misc]
+    LLMResult = Any  # type: ignore[assignment,misc]
+class AgentEvalCallbackHandler(BaseCallbackHandler):  # type: ignore[misc]
+    """LangChain callback handler that records tool calls into the active Tracer.
+    Records each tool invocation's name, arguments, result, duration, and any
+    errors. The tracer is looked up via ``Tracer.current()`` each time a tool
+    finishes, so the handler does not hold a tracer reference itself.
+    If no tracer is active when a tool finishes, nothing is recorded; the
+    bookkeeping entry for that tool run is still discarded so the handler
+    does not accumulate state when used outside of a test run.
+    Raises:
+        ImportError: On construction, if langchain-core is not installed.
+    """
+    def __init__(self) -> None:
+        if not _LANGCHAIN_AVAILABLE:
+            raise ImportError(
+                "langchain-core is required for AgentEvalCallbackHandler. "
+                "Install it with: pip install agenteval-py[langchain]"
+            )
+        super().__init__()
+        # Maps str(LangChain run_id) → (start_time, tool_name, parsed_args)
+        # for tool runs that have started but not yet ended or errored.
+        self._pending: dict[str, tuple[float, str, dict[str, Any]]] = {}
+    def on_tool_start(
+        self,
+        serialized: dict[str, Any],
+        input_str: str,
+        *,
+        run_id: UUID,
+        **kwargs: Any,
+    ) -> None:
+        """Remember the start time, tool name and arguments of a tool run.
+        Args:
+            serialized: Tool description; its "name" entry is used when present.
+            input_str: Tool input. If it is a JSON object string it is parsed
+                into the arguments dict; anything else is stored as
+                ``{"input": ...}``.
+            run_id: Identifier used to pair this call with its end/error event.
+            **kwargs: Extra callback data; ``name`` is used as a fallback tool
+                name.
+        """
+        # ``serialized or {}`` tolerates a None payload instead of raising
+        # AttributeError inside the agent's callback machinery.
+        tool_name: str = (serialized or {}).get("name", kwargs.get("name", "unknown"))
+        try:
+            import json
+            args = json.loads(input_str) if isinstance(input_str, str) else {"input": input_str}
+            if not isinstance(args, dict):
+                args = {"input": args}
+        except Exception:
+            # Deliberately best-effort: unparseable input is recorded verbatim
+            # rather than breaking the agent run.
+            args = {"input": input_str}
+        self._pending[str(run_id)] = (time.perf_counter(), tool_name, args)
+    def on_tool_end(
+        self,
+        output: str,
+        *,
+        run_id: UUID,
+        **kwargs: Any,
+    ) -> None:
+        """Record a successfully finished tool run with its output."""
+        self._finish(run_id, result=output, error=None)
+    def on_tool_error(
+        self,
+        error: BaseException,
+        *,
+        run_id: UUID,
+        **kwargs: Any,
+    ) -> None:
+        """Record a failed tool run; the error is stored as 'TypeName: message'."""
+        self._finish(run_id, result=None, error=f"{type(error).__name__}: {error}")
+    def _finish(self, run_id: UUID, *, result: Any, error: Any) -> None:
+        """Close the pending entry for run_id and report it to the active tracer."""
+        # Pop before checking for a tracer: otherwise entries created by
+        # on_tool_start would never be removed when no tracer is active.
+        entry = self._pending.pop(str(run_id), None)
+        tracer = Tracer.current()
+        if entry is None or tracer is None:
+            return
+        start_time, tool_name, args = entry
+        duration = time.perf_counter() - start_time
+        tracer.record_tool_call(
+            name=tool_name,
+            arguments=args,
+            result=result,
+            duration_seconds=duration,
+            # Wall-clock start time, reconstructed from the measured duration.
+            timestamp=time.time() - duration,
+            error=error,
+        )

agenteval/adapters/openai_adapter.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""OpenAI function calling adapter for agenteval.
+Usage::
+    from agenteval.adapters.openai_adapter import wrap_tools, extract_token_usage
+    async def test_my_agent(tracer: Tracer) -> None:
+        tools = wrap_tools({"search": search_fn, "calculator": calc_fn}, tracer)
+        async with tracer.run(input=prompt) as run:
+            # Your OpenAI tool-calling loop:
+            messages = [{"role": "user", "content": prompt}]
+            while True:
+                response = await client.chat.completions.create(
+                    model="gpt-4o",
+                    messages=messages,
+                    tools=openai_tool_schemas,
+                )
+                run.set_token_usage(extract_token_usage(response))
+                choice = response.choices[0]
+                if choice.finish_reason == "tool_calls":
+                    for tc in choice.message.tool_calls:
+                        import json
+                        fn_name = tc.function.name
+                        fn_args = json.loads(tc.function.arguments)
+                        result = await tools[fn_name](**fn_args)
+                        # append tool result to messages ...
+                elif choice.finish_reason == "stop":
+                    run.set_output(choice.message.content)
+                    break
+        tracer.assert_that().called_tool("search").no_errors().check()
+"""
+from __future__ import annotations
+from typing import Any, Callable, Optional
+from agenteval.tracer import Tracer
+def wrap_tools(
+    tool_functions: dict[str, Callable[..., Any]],
+    tracer: Tracer,
+) -> dict[str, Callable[..., Any]]:
+    """Return tracer-wrapped versions of a dict of OpenAI tool functions.
+    Args:
+        tool_functions: Mapping of tool name → callable.
+        tracer: The Tracer whose ``wrap`` method instruments each callable.
+    Returns:
+        A new dict with the same keys (in the same order) whose values are
+        the results of ``tracer.wrap(fn, name=name)``.
+    Example::
+        tools = wrap_tools({"search": search_fn, "weather": weather_fn}, tracer)
+        result = await tools["search"](query="python news")
+    """
+    instrumented = (
+        (tool_name, tracer.wrap(tool_fn, name=tool_name))
+        for tool_name, tool_fn in tool_functions.items()
+    )
+    return dict(instrumented)
+def extract_token_usage(response: Any) -> Optional[dict[str, int]]:
+    """Extract token usage from an OpenAI ChatCompletion response object.
+    Args:
+        response: An object with a ``usage`` attribute, e.g. an
+            openai.types.chat.ChatCompletion.
+    Returns:
+        Dict with prompt_tokens, completion_tokens and total_tokens (each
+        defaulting to 0 when the attribute is absent), or None when the
+        response has no usage data.
+    """
+    usage = getattr(response, "usage", None)
+    if usage is not None:
+        counters = ("prompt_tokens", "completion_tokens", "total_tokens")
+        return {counter: getattr(usage, counter, 0) for counter in counters}
+    return None

agenteval/assertions.py ADDED Viewed

@@ -0,0 +1,289 @@
+"""Fluent assertion library for inspecting AgentTrace objects."""
+from __future__ import annotations
+import json
+import math
+from typing import Any, Callable, Literal, Optional, Union
+from pydantic import BaseModel
+from agenteval.models import AgentTrace
+class AssertionSet:
+    """Fluent assertions on an AgentTrace.
+    Failures are **collected**, not raised immediately. Call `.check()` at the
+    end of a chain to raise a single AssertionError listing all failures, or
+    inspect `.passed` / `.failures` to read the outcome without raising.
+    Every assertion method returns ``self`` so calls can be chained.
+    Usage::
+        tracer.assert_that()
+            .called_tool("search")
+            .never_called_tool("delete")
+            .completed_within_steps(5)
+            .no_errors()
+            .check()
+    """
+    def __init__(self, trace: AgentTrace) -> None:
+        self._trace = trace
+        # Human-readable failure messages, in the order they were detected.
+        self._failures: list[str] = []
+    def _calls_named(self, name: str) -> list[Any]:
+        """Return the recorded tool calls whose name equals *name*, in order."""
+        return [tc for tc in self._trace.tool_calls if tc.name == name]
+    # ------------------------------------------------------------------ #
+    # Tool call assertions
+    # ------------------------------------------------------------------ #
+    def called_tool(self, name: str) -> "AssertionSet":
+        """Assert that the tool was called at least once."""
+        if not self._calls_named(name):
+            all_tools = [tc.name for tc in self._trace.tool_calls]
+            self._failures.append(
+                f"Expected tool '{name}' to be called, but it was not. "
+                f"Tools called: {all_tools or '(none)'}"
+            )
+        return self
+    def never_called_tool(self, name: str) -> "AssertionSet":
+        """Assert that the tool was never called."""
+        calls = self._calls_named(name)
+        if calls:
+            self._failures.append(
+                f"Expected tool '{name}' to never be called, but it was called {len(calls)} time(s)."
+            )
+        return self
+    def tool_call_count(
+        self,
+        name: str,
+        *,
+        min: int = 0,
+        max: float = math.inf,
+    ) -> "AssertionSet":
+        """Assert that the tool was called between min and max times (inclusive).
+        Args:
+            name: Tool name to count.
+            min: Smallest acceptable number of calls.
+            max: Largest acceptable number of calls; the default (infinity)
+                means "no upper bound".
+        """
+        count = len(self._calls_named(name))
+        if not (min <= count <= max):
+            self._failures.append(
+                f"Expected tool '{name}' to be called between {min} and "
+                f"{'∞' if max == math.inf else max} times, but it was called {count} time(s)."
+            )
+        return self
+    def tool_called_before(self, tool_a: str, tool_b: str) -> "AssertionSet":
+        """Assert that tool_a was called before tool_b (at least one call each).
+        Only the first call of each tool is compared.
+        """
+        calls = self._trace.tool_calls
+        first_a = next((i for i, tc in enumerate(calls) if tc.name == tool_a), None)
+        first_b = next((i for i, tc in enumerate(calls) if tc.name == tool_b), None)
+        if first_a is None:
+            self._failures.append(
+                f"Ordering assertion failed: tool '{tool_a}' was never called."
+            )
+        elif first_b is None:
+            self._failures.append(
+                f"Ordering assertion failed: tool '{tool_b}' was never called."
+            )
+        elif first_a >= first_b:
+            self._failures.append(
+                f"Expected '{tool_a}' to be called before '{tool_b}', "
+                f"but '{tool_b}' was called first (positions: {tool_a}={first_a}, {tool_b}={first_b})."
+            )
+        return self
+    def tool_called_with_args(
+        self,
+        name: str,
+        args: dict[str, Any],
+        *,
+        match: Literal["subset", "exact"] = "subset",
+    ) -> "AssertionSet":
+        """Assert that a tool was called with specific arguments.
+        The assertion passes if at least one call of the tool matches.
+        Args:
+            name: Tool name to check.
+            args: Expected arguments.
+            match: 'subset' (default) checks all provided keys are present with
+                   matching values. 'exact' requires the arguments dict to match exactly.
+        """
+        matching_calls = self._calls_named(name)
+        if not matching_calls:
+            self._failures.append(
+                f"tool_called_with_args: tool '{name}' was never called."
+            )
+            return self
+        def _matches(call_args: dict[str, Any]) -> bool:
+            if match == "exact":
+                return call_args == args
+            # subset: every expected key must really be present. Comparing
+            # ``call_args.get(k) == v`` would wrongly accept a *missing* key
+            # whenever the expected value is None.
+            return all(k in call_args and call_args[k] == v for k, v in args.items())
+        if not any(_matches(tc.arguments) for tc in matching_calls):
+            actual_args = [tc.arguments for tc in matching_calls]
+            self._failures.append(
+                f"tool '{name}' was called {len(matching_calls)} time(s), but none matched "
+                f"the expected args {args} (match='{match}'). Actual args: {actual_args}"
+            )
+        return self
+    # ------------------------------------------------------------------ #
+    # Step / time assertions
+    # ------------------------------------------------------------------ #
+    def completed_within_steps(self, n: int) -> "AssertionSet":
+        """Assert that the agent finished in n steps or fewer."""
+        actual = self._trace.effective_steps
+        if actual > n:
+            self._failures.append(
+                f"Expected agent to complete within {n} steps, but took {actual} steps."
+            )
+        return self
+    def completed_within_seconds(self, n: float) -> "AssertionSet":
+        """Assert that the agent finished within n seconds."""
+        actual = self._trace.duration_seconds
+        if actual > n:
+            self._failures.append(
+                f"Expected agent to complete within {n:.2f}s, but took {actual:.2f}s."
+            )
+        return self
+    # ------------------------------------------------------------------ #
+    # Output assertions
+    # ------------------------------------------------------------------ #
+    def response_contains(self, keyword: str, *, case_sensitive: bool = True) -> "AssertionSet":
+        """Assert that the final response contains a keyword.
+        The output is converted with ``str()`` before searching. A None
+        output always fails.
+        """
+        output = self._trace.output
+        if output is None:
+            self._failures.append(
+                f"response_contains: agent output is None, expected to contain '{keyword}'."
+            )
+            return self
+        text = str(output)
+        haystack = text if case_sensitive else text.lower()
+        needle = keyword if case_sensitive else keyword.lower()
+        if needle not in haystack:
+            # Keep the failure message readable for long responses.
+            preview = (text[:200] + "...") if len(text) > 200 else text
+            self._failures.append(
+                f"Expected response to contain '{keyword}', but it did not. "
+                f"Response: {preview!r}"
+            )
+        return self
+    def response_matches_schema(
+        self,
+        schema: type[BaseModel],
+        *,
+        parse_json: bool = True,
+    ) -> "AssertionSet":
+        """Assert that the final response matches a Pydantic schema.
+        If the output is a string and parse_json=True (default), it will be
+        JSON-parsed first before validation. Validation is done with
+        ``schema.model_validate``; any exception it raises is recorded as a
+        failure.
+        """
+        output = self._trace.output
+        if output is None:
+            self._failures.append(
+                f"response_matches_schema: agent output is None, "
+                f"expected to match {schema.__name__}."
+            )
+            return self
+        data: Any = output
+        if isinstance(output, str) and parse_json:
+            try:
+                data = json.loads(output)
+            except json.JSONDecodeError as e:
+                self._failures.append(
+                    f"response_matches_schema: failed to JSON-parse output before "
+                    f"validating against {schema.__name__}: {e}. "
+                    f"Output was: {output[:200]!r}"
+                )
+                return self
+        try:
+            schema.model_validate(data)
+        except Exception as e:
+            self._failures.append(
+                f"response_matches_schema: output does not match schema "
+                f"{schema.__name__}: {e}"
+            )
+        return self
+    # ------------------------------------------------------------------ #
+    # Error assertions
+    # ------------------------------------------------------------------ #
+    def no_errors(self) -> "AssertionSet":
+        """Assert that the trace's ``error`` field is None."""
+        if self._trace.error is not None:
+            self._failures.append(
+                f"Expected no errors, but agent raised: {self._trace.error}"
+            )
+        return self
+    # ------------------------------------------------------------------ #
+    # Custom / escape hatch
+    # ------------------------------------------------------------------ #
+    def custom(
+        self,
+        fn: Callable[[AgentTrace], Union[bool, str]],
+        *,
+        message: Optional[str] = None,
+    ) -> "AssertionSet":
+        """Run a custom assertion function against the trace.
+        Args:
+            fn: Callable that receives the AgentTrace and returns True (pass),
+                False (fail), or a failure message string. An exception raised
+                by fn is recorded as a failure. Any other return value
+                (including None) is treated as a pass.
+            message: Optional failure message to use when fn returns False.
+        """
+        try:
+            result = fn(self._trace)
+        except Exception as e:
+            self._failures.append(
+                f"custom assertion raised an exception: {type(e).__name__}: {e}"
+            )
+            return self
+        if result is True:
+            return self
+        if result is False:
+            self._failures.append(
+                message or "custom assertion failed (returned False)."
+            )
+        elif isinstance(result, str):
+            self._failures.append(result)
+        return self
+    # ------------------------------------------------------------------ #
+    # Terminator
+    # ------------------------------------------------------------------ #
+    def check(self) -> None:
+        """Raise AssertionError listing all collected failures. No-op if all passed."""
+        if self._failures:
+            lines = "\n".join(f"  • {f}" for f in self._failures)
+            raise AssertionError(f"Trace assertions failed ({len(self._failures)} failure(s)):\n{lines}")
+    # ------------------------------------------------------------------ #
+    # Introspection (for use without raising)
+    # ------------------------------------------------------------------ #
+    @property
+    def passed(self) -> bool:
+        """True if no failures have been collected."""
+        return not self._failures
+    @property
+    def failures(self) -> list[str]:
+        """Copy of all collected failure messages (safe for callers to mutate)."""
+        return list(self._failures)