agenttest-py 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agenttest/__init__.py ADDED
@@ -0,0 +1,67 @@
1
+ """agenttest — The pytest of AI agents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .record import record
6
+ from .assertions import (
7
+ assert_compare,
8
+ assert_conciseness,
9
+ assert_contains_action,
10
+ assert_faithfulness,
11
+ assert_no_hallucination,
12
+ assert_no_toxicity,
13
+ assert_relevance,
14
+ assert_score,
15
+ assert_tone,
16
+ )
17
+ from .judge import Judge, judge
18
+ from .reporter import load_last_report, report_results
19
+ from .runner import discover_tests, run_tests
20
+
21
# Public names exported by `from agenttest import *`.
# "eval" and "EvalResult" are defined below in this module; everything else
# is re-exported from the submodule imports above. Note "eval" deliberately
# shadows the builtin for users of the package namespace.
__all__ = [
    "eval",
    "judge",
    "record",
    "EvalResult",
    "Judge",
    "assert_tone",
    "assert_contains_action",
    "assert_no_hallucination",
    "assert_relevance",
    "assert_no_toxicity",
    "assert_faithfulness",
    "assert_conciseness",
    "assert_score",
    "assert_compare",
    "discover_tests",
    "run_tests",
    "load_last_report",
    "report_results",
]
41
+
42
+
43
def eval(fn: object) -> object:
    """
    Decorator to mark a function as an agent eval test.
    Discovered by agenttest run.

    Runtime no-op: returns *fn* unchanged. Shadowing the builtin `eval`
    is intentional for this package's public API.
    """
    # Passthrough - runner discovers by parsing source for @eval
    return fn
50
+
51
+
52
class EvalResult:
    """Result of a single eval run.

    Attributes:
        test_name: Name of the eval function that ran.
        status: Outcome label assigned by the runner.
        duration: Wall-clock runtime in seconds.
        error_message: Failure detail, or None on success.
        scores: Per-metric scores recorded during the run (never None).
    """

    def __init__(
        self,
        test_name: str,
        status: str,
        duration: float = 0,
        error_message: str | None = None,
        scores: dict[str, float] | None = None,
    ):
        self.test_name = test_name
        self.status = status
        self.duration = duration
        self.error_message = error_message
        # Normalize None to an empty dict so callers can iterate safely.
        self.scores = scores or {}

    def __repr__(self) -> str:
        """Unambiguous debug representation (added for debuggability)."""
        return (
            f"{type(self).__name__}(test_name={self.test_name!r}, "
            f"status={self.status!r}, duration={self.duration!r}, "
            f"error_message={self.error_message!r}, scores={self.scores!r})"
        )
agenttest/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Allow running as python -m agenttest."""
2
+
3
+ from .cli import main
4
+
5
# Support `python -m agenttest` by delegating to the CLI entry point.
if __name__ == "__main__":
    main()
@@ -0,0 +1,154 @@
1
+ """Assertion helpers for agent evaluations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from .judge import Judge, judge
8
+
9
+ __all__ = [
10
+ "assert_tone",
11
+ "assert_contains_action",
12
+ "assert_no_hallucination",
13
+ "assert_relevance",
14
+ "assert_no_toxicity",
15
+ "assert_faithfulness",
16
+ "assert_conciseness",
17
+ "assert_score",
18
+ "assert_compare",
19
+ ]
20
+
21
+
22
+ def _format_explanation(data: dict[str, Any]) -> str:
23
+ """Extract explanation from judge JSON response."""
24
+ for key in ("explanation", "reason"):
25
+ if key in data and data[key]:
26
+ return str(data[key])
27
+ return ""
28
+
29
+
30
def assert_tone(
    response: str,
    expected: str,
    judge_instance: Judge | None = None,
) -> None:
    """
    Assert the response has the expected tone (empathetic, professional, aggressive, neutral).

    Raises:
        AssertionError: If the judged tone differs from *expected*.
    """
    evaluator = judge_instance or judge
    observed = evaluator.tone(response)
    if observed == expected:
        return
    raise AssertionError(f"Expected tone '{expected}', got '{observed}'")
44
+
45
+
46
def assert_contains_action(
    response: str,
    action: str,
    judge_instance: Judge | None = None,
) -> None:
    """Assert the response contains or implies the given action.

    Raises:
        AssertionError: If the judge does not detect *action* in *response*.
    """
    evaluator = judge_instance or judge
    present = evaluator.contains_action(response, action)
    if not present:
        raise AssertionError(
            f"Expected response to contain action '{action}', but it did not"
        )
57
+
58
+
59
def assert_no_hallucination(
    response: str,
    context: str | None = None,
    judge_instance: Judge | None = None,
) -> None:
    """Assert the response contains no hallucinations.

    Raises:
        AssertionError: If the judge flags unsupported claims.
    """
    evaluator = judge_instance or judge
    if evaluator.no_hallucination(response, context):
        return
    raise AssertionError(
        "Response appears to contain hallucinations (claims not supported by context)"
    )
70
+
71
+
72
def assert_relevance(
    response: str,
    query: str,
    min_score: float = 0.7,
    judge_instance: Judge | None = None,
) -> None:
    """Assert relevance score of response to query is at least min_score.

    Raises:
        AssertionError: If the judged relevance is below *min_score*.
    """
    evaluator = judge_instance or judge
    relevance = evaluator.relevance(response, query)
    if relevance >= min_score:
        return
    raise AssertionError(f"Relevance score {relevance:.2f} below threshold {min_score}")
85
+
86
+
87
def assert_no_toxicity(
    response: str,
    judge_instance: Judge | None = None,
) -> None:
    """Assert the response is not toxic.

    Raises:
        AssertionError: If the judge flags the response as toxic.
    """
    evaluator = judge_instance or judge
    if not evaluator.toxicity(response):
        return
    raise AssertionError("Response contains toxic content")
95
+
96
+
97
def assert_faithfulness(
    response: str,
    source: str,
    min_score: float = 0.8,
    judge_instance: Judge | None = None,
) -> None:
    """Assert faithfulness score of response to source is at least min_score.

    Raises:
        AssertionError: If the judged faithfulness is below *min_score*.
    """
    evaluator = judge_instance or judge
    faithfulness = evaluator.faithfulness(response, source)
    if faithfulness >= min_score:
        return
    raise AssertionError(
        f"Faithfulness score {faithfulness:.2f} below threshold {min_score}"
    )
110
+
111
+
112
def assert_conciseness(
    response: str,
    expected: str = "good",
    judge_instance: Judge | None = None,
) -> None:
    """Assert conciseness is expected (too_short, good, too_long).

    Raises:
        AssertionError: If the judged verdict differs from *expected*.
    """
    evaluator = judge_instance or judge
    verdict = evaluator.conciseness(response)
    if verdict == expected:
        return
    raise AssertionError(f"Expected conciseness '{expected}', got '{verdict}'")
124
+
125
+
126
def assert_score(
    response: str,
    criteria: str,
    min_score: float = 0.7,
    judge_instance: Judge | None = None,
) -> None:
    """Assert custom score meets minimum.

    Raises:
        AssertionError: If the judged score for *criteria* is below *min_score*.
    """
    evaluator = judge_instance or judge
    result = evaluator.score(response, criteria)
    if result >= min_score:
        return
    raise AssertionError(
        f"Score {result:.2f} below threshold {min_score} for criteria: {criteria}"
    )
139
+
140
+
141
def assert_compare(
    response_a: str,
    response_b: str,
    criteria: str,
    expected_winner: str,
    judge_instance: Judge | None = None,
) -> None:
    """Assert expected_winner ('a', 'b', or 'tie') wins the comparison.

    Raises:
        AssertionError: If the judged winner differs from *expected_winner*.
    """
    evaluator = judge_instance or judge
    verdict = evaluator.compare(response_a, response_b, criteria)
    if verdict == expected_winner:
        return
    raise AssertionError(f"Expected winner '{expected_winner}', got '{verdict}'")
agenttest/cli.py ADDED
@@ -0,0 +1,249 @@
1
+ """CLI for agenttest."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ import click
11
+ from rich.console import Console
12
+ from rich.progress import Progress, SpinnerColumn, TextColumn
13
+ from rich.panel import Panel
14
+
15
+ from .config import load_config
16
+ from .reporter import RESULTS_FILE, load_last_report, report_compare, report_diff, report_results
17
+ from .runner import discover_tests, run_tests
18
+
19
+ console = Console()
20
+
21
+
22
+ def _results_path(tag: str | None = None) -> Path:
23
+ """Path for results file, optionally tagged."""
24
+ if tag:
25
+ return Path(f".agenttest_results_{tag}.json")
26
+ return RESULTS_FILE
27
+
28
+
29
# Root click command group; subcommands attach via @app.command().
@click.group()
@click.version_option(version="0.1.0")  # NOTE(review): keep in sync with the package version
def app() -> None:
    """The pytest of AI agents. Eval-driven testing for LLM applications."""
    pass
34
+
35
+
36
@app.command()
@click.option("--path", "-p", default=".", type=click.Path(exists=True, file_okay=False, dir_okay=True))
@click.option("--filter", "-k", "filter_pattern", default=None)
@click.option("--workers", "-w", default=4, type=int)
@click.option("--tag", "-t", default=None)
def run(path: str, filter_pattern: Optional[str], workers: int, tag: Optional[str]) -> None:
    """Discover and run all agent eval tests."""
    # Local import keeps config concerns out of module import time.
    from .config import get_api_key

    path_obj = Path(path)
    config = load_config(path_obj)
    # Fail fast with a readable message when no API key is configured.
    try:
        get_api_key(config)
    except ValueError as e:
        console.print(str(e), style="red")
        sys.exit(1)

    timeout = config.get("timeout_seconds", 30)
    fail_threshold = config.get("fail_threshold", 0.8)
    # NOTE(review): the config "workers" value is never read here — the CLI
    # option default (4) always wins unless -w is passed. Confirm intended.

    tests = discover_tests(path_obj, filter_pattern)
    if not tests:
        console.print(Panel(
            "No tests found. Create files matching agent_test_*.py or *_agent_test.py with @eval decorated functions.",
            title="[yellow]No Tests[/yellow]",
            border_style="yellow",
        ))
        sys.exit(1)

    console.print(f"[cyan]Running {len(tests)} test(s)...[/cyan]\n")

    # Spinner-only progress: run_tests executes the whole batch in a single
    # call, so the task is only marked complete after it returns.
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
    ) as progress:
        task = progress.add_task("Executing evals...", total=len(tests))
        results = run_tests(path_obj, filter_pattern, workers=workers, timeout_seconds=timeout)
        progress.update(task, completed=len(tests))

    report = report_results(results, fail_threshold=fail_threshold)

    # Persist tagged runs so `agenttest compare` / `agenttest diff` can load
    # them later; the untagged default file is handled by the reporter.
    if tag:
        out = _results_path(tag)
        out.write_text(json.dumps(report, indent=2))
        console.print(f"\n[dim]Results saved to {out}[/dim]")

    # CI gating: non-zero exit when the pass ratio falls below the threshold.
    total = report.get("total", 0)
    passed = report.get("passed", 0)
    if total > 0 and (passed / total) < fail_threshold:
        sys.exit(1)
87
+
88
+
89
@app.command()
@click.option("--path", "-p", default=".", type=click.Path())
def init(path: str) -> None:
    """Scaffold agent_test_example.py, agenttest.toml, and GitHub Actions workflow."""
    path_obj = Path(path).resolve()
    path_obj.mkdir(parents=True, exist_ok=True)

    # Example test module written verbatim into the target directory.
    example_content = '''"""Example agent eval tests. Replace my_agent with your agent."""

from agenttest import eval, judge, record


def my_agent(query: str) -> str:
    """Mock agent - replace with your actual agent call."""
    if "refund" in query.lower():
        return (
            "I'm sorry to hear you're unhappy. Our refund policy allows "
            "returns within 30 days. Would you like me to start the refund process?"
        )
    return "How can I help you today?"


@eval
def test_customer_support_refund():
    query = "I want a refund"
    response = my_agent(query)
    record(query, response)
    assert judge.tone(response) == "empathetic"
    assert judge.contains_action(response, "refund_policy")
    assert judge.no_hallucination(response)


@eval
def test_customer_support_generic():
    query = "What are your hours?"
    response = my_agent(query)
    record(query, response)
    assert judge.relevance(response, "business hours") >= 0.3  # May be generic
    assert not judge.toxicity(response)


@eval
def test_conciseness():
    query = "Hi"
    response = my_agent(query)
    record(query, response)
    assert judge.conciseness(response) in ("good", "too_short", "too_long")


@eval
def test_custom_score():
    query = "I want a refund"
    response = my_agent(query)
    score = judge.score(
        response,
        criteria="Does the response show empathy and offer a clear next step?",
    )
    record(query, response, score)
    assert score >= 0.5
'''

    # Default configuration scaffold; values mirror DEFAULT_CONFIG in config.py.
    toml_content = '''[agenttest]
model = "claude-3-5-haiku-latest"
timeout_seconds = 30
workers = 4
fail_threshold = 0.8
cache = true

[agenttest.env]
ANTHROPIC_API_KEY = "$ANTHROPIC_API_KEY"
'''

    # CI workflow that runs the evals on pushes and pull requests to main.
    workflow_content = '''name: Agent Evals
on:
  pull_request:
    branches: [main]
  push:
    branches: [main]

jobs:
  eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - run: pip install agenttest-py
      - run: agenttest run
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
'''

    # NOTE(review): existing files are overwritten without prompting.
    (path_obj / "agent_test_example.py").write_text(example_content)
    (path_obj / "agenttest.toml").write_text(toml_content)
    workflows_dir = path_obj / ".github" / "workflows"
    workflows_dir.mkdir(parents=True, exist_ok=True)
    (workflows_dir / "agenttest.yml").write_text(workflow_content)

    console.print("[green]✓[/green] Created agent_test_example.py")
    console.print("[green]✓[/green] Created agenttest.toml")
    console.print("[green]✓[/green] Created .github/workflows/agenttest.yml")
    console.print("\n[dim]Run: agenttest run[/dim]")
    console.print("[dim]Set ANTHROPIC_API_KEY in your environment or agenttest.toml[/dim]")
193
+
194
+
195
@app.command()
def report() -> None:
    """Show last run results from .agenttest_results.json."""
    payload = load_last_report()
    if payload:
        # Re-render the stored results without re-running anything.
        report_results(payload.get("results", []), live=False)
        return
    console.print("[yellow]No previous run found. Run 'agenttest run' first.[/yellow]")
    sys.exit(1)
204
+
205
+
206
def _load_tagged_run(tag: str) -> dict:
    """Load the saved results for *tag*, exiting with an error if absent."""
    p = _results_path(tag)
    if not p.is_file():
        console.print(f"[red]No results found for tag '{tag}'. Run: agenttest run --tag {tag}[/red]")
        sys.exit(1)
    return json.loads(p.read_text())


@app.command()
@click.argument("v1")
@click.argument("v2")
def compare(v1: str, v2: str) -> None:
    """Compare pass/fail metrics between two tagged runs."""
    # v1 is validated first, matching the original error-message ordering.
    r1 = _load_tagged_run(v1)
    r2 = _load_tagged_run(v2)
    report_compare(r1, r2, tag_a=v1, tag_b=v2)


@app.command()
@click.argument("v1")
@click.argument("v2")
def diff(v1: str, v2: str) -> None:
    """Show side-by-side diff of agent responses between two runs. The git diff for agent behavior."""
    r1 = _load_tagged_run(v1)
    r2 = _load_tagged_run(v2)
    report_diff(r1, r2, tag_a=v1, tag_b=v2)
242
+
243
+
244
def main() -> None:
    """Console-script entry point; delegates to the click command group."""
    app()


if __name__ == "__main__":
    main()
agenttest/config.py ADDED
@@ -0,0 +1,123 @@
1
+ """Configuration loader for agenttest."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ try:
10
+ import toml
11
+ except ImportError:
12
+ toml = None # type: ignore[assignment]
13
+
14
# Baseline settings used when no config file overrides them.
DEFAULT_CONFIG = {
    "model": "claude-3-5-haiku-latest",
    "timeout_seconds": 30,  # passed to run_tests as the per-run timeout
    "workers": 4,  # NOTE(review): the CLI currently uses its own -w default instead
    "fail_threshold": 0.8,  # min pass ratio before `agenttest run` exits non-zero
    "cache": True,
}

# Candidate config files, checked in order while walking up from the start dir.
CONFIG_FILENAMES = ("agenttest.toml", "pyproject.toml")
23
+
24
+
25
def _find_config_path(start: Path) -> Path | None:
    """Find agenttest.toml or [tool.agenttest] in pyproject.toml.

    Walks upward from *start*, at most 10 directory levels. agenttest.toml
    always counts; pyproject.toml only counts when it declares a
    [tool.agenttest] table.

    Returns:
        The config file path, or None if nothing suitable was found.
    """
    current = start.resolve()
    for _ in range(10):  # Max 10 levels up
        for name in CONFIG_FILENAMES:
            path = current / name
            if not path.is_file():
                continue
            if name == "agenttest.toml":
                return path
            # Fix: previously, a missing toml package raised AttributeError
            # inside the try block and was silently swallowed. Make the
            # "no parser available" case an explicit skip instead.
            if name == "pyproject.toml" and toml is not None:
                try:
                    data = toml.load(path)
                    if "tool" in data and "agenttest" in data["tool"]:
                        return path
                except Exception:
                    # Unparseable pyproject.toml is treated as "no config here".
                    pass
        parent = current.parent
        if parent == current:
            # Reached the filesystem root.
            break
        current = parent
    return None
46
+
47
+
48
def _load_toml(path: Path) -> dict[str, Any]:
    """Load TOML file.

    Raises:
        ImportError: If the optional toml package is not installed.
    """
    if toml is not None:
        return toml.load(path)
    raise ImportError(
        "toml package is required for config loading. Install with: pip install toml"
    )
55
+
56
+
57
+ def _resolve_env(value: str) -> str:
58
+ """Resolve $VAR or ${VAR} in string. Missing vars become empty string."""
59
+ if not isinstance(value, str):
60
+ return value
61
+ if value.startswith("$") and len(value) > 1:
62
+ var = value[1:]
63
+ if var.startswith("{"):
64
+ var = var[1:-1]
65
+ return os.environ.get(var, "")
66
+ return value
67
+
68
+
69
def load_config(start_dir: Path | str | None = None) -> dict[str, Any]:
    """
    Load agenttest configuration from agenttest.toml or pyproject.toml.

    Searches upward from start_dir (default: cwd) for config files.
    Environment variables in config values (e.g. $ANTHROPIC_API_KEY) are resolved.

    Returns:
        Merged config dict with defaults.
    """
    base = Path(start_dir or os.getcwd())
    merged: dict[str, Any] = {**DEFAULT_CONFIG}

    config_path = _find_config_path(base)
    if config_path is None:
        return merged

    raw = _load_toml(config_path)
    # The agenttest table lives under [tool.agenttest] in pyproject.toml,
    # or at the top level of a dedicated agenttest.toml.
    if config_path.name == "pyproject.toml":
        section = raw.get("tool", {}).get("agenttest", {})
    else:
        section = raw.get("agenttest", {})

    # Merge scalar settings; the "env" table gets special handling below.
    for key, value in section.items():
        if key != "env" and value is not None:
            merged[key] = value

    # Resolve $VAR references in [agenttest.env]; non-strings pass through.
    for key, val in section.get("env", {}).items():
        merged.setdefault("env", {})[key] = (
            _resolve_env(val) if isinstance(val, str) else val
        )

    return merged
106
+
107
+
108
+ def get_api_key(config: dict[str, Any] | None = None) -> str:
109
+ """
110
+ Get Anthropic API key from config env or ANTHROPIC_API_KEY.
111
+
112
+ Raises:
113
+ ValueError: If no API key is found.
114
+ """
115
+ if config is None:
116
+ config = load_config()
117
+ env = config.get("env", {})
118
+ key = env.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_API_KEY")
119
+ if not key:
120
+ raise ValueError(
121
+ "ANTHROPIC_API_KEY not set. Set it in your environment or in agenttest.toml under agenttest.env."
122
+ )
123
+ return str(key)