PyPI - evalpilot - Versions diffs - 0.1.0__tar.gz - Mend

evalpilot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

evalpilot-0.1.0/LICENSE +21 -0
evalpilot-0.1.0/PKG-INFO +144 -0
evalpilot-0.1.0/README.md +114 -0
evalpilot-0.1.0/eval_agent/__init__.py +1 -0
evalpilot-0.1.0/eval_agent/adapters/__init__.py +39 -0
evalpilot-0.1.0/eval_agent/adapters/cli.py +79 -0
evalpilot-0.1.0/eval_agent/adapters/function.py +61 -0
evalpilot-0.1.0/eval_agent/adapters/http.py +78 -0
evalpilot-0.1.0/eval_agent/adapters/webui.py +197 -0
evalpilot-0.1.0/eval_agent/aggregate.py +57 -0
evalpilot-0.1.0/eval_agent/cli.py +106 -0
evalpilot-0.1.0/eval_agent/examples/__init__.py +0 -0
evalpilot-0.1.0/eval_agent/examples/echo.py +11 -0
evalpilot-0.1.0/eval_agent/loader.py +35 -0
evalpilot-0.1.0/eval_agent/models.py +79 -0
evalpilot-0.1.0/eval_agent/reporter.py +70 -0
evalpilot-0.1.0/eval_agent/runner.py +37 -0
evalpilot-0.1.0/eval_agent/sandbox.py +43 -0
evalpilot-0.1.0/eval_agent/scorers/__init__.py +34 -0
evalpilot-0.1.0/eval_agent/scorers/checkpoint.py +66 -0
evalpilot-0.1.0/eval_agent/scorers/efficiency.py +19 -0
evalpilot-0.1.0/eval_agent/scorers/llm_judge.py +57 -0
evalpilot-0.1.0/eval_agent/scorers/trajectory.py +50 -0
evalpilot-0.1.0/eval_agent/templates/index.html.j2 +73 -0
evalpilot-0.1.0/eval_agent/templates/report.html.j2 +68 -0
evalpilot-0.1.0/eval_agent/wizard.py +132 -0
evalpilot-0.1.0/evalpilot.egg-info/PKG-INFO +144 -0
evalpilot-0.1.0/evalpilot.egg-info/SOURCES.txt +49 -0
evalpilot-0.1.0/evalpilot.egg-info/dependency_links.txt +1 -0
evalpilot-0.1.0/evalpilot.egg-info/entry_points.txt +4 -0
evalpilot-0.1.0/evalpilot.egg-info/requires.txt +11 -0
evalpilot-0.1.0/evalpilot.egg-info/top_level.txt +1 -0
evalpilot-0.1.0/pyproject.toml +46 -0
evalpilot-0.1.0/setup.cfg +4 -0
evalpilot-0.1.0/tests/test_adapter_cli.py +47 -0
evalpilot-0.1.0/tests/test_adapter_function.py +34 -0
evalpilot-0.1.0/tests/test_adapter_http.py +48 -0
evalpilot-0.1.0/tests/test_adapter_webui.py +210 -0
evalpilot-0.1.0/tests/test_aggregate.py +44 -0
evalpilot-0.1.0/tests/test_cli.py +159 -0
evalpilot-0.1.0/tests/test_index.py +45 -0
evalpilot-0.1.0/tests/test_loader.py +37 -0
evalpilot-0.1.0/tests/test_models.py +31 -0
evalpilot-0.1.0/tests/test_reporter.py +43 -0
evalpilot-0.1.0/tests/test_runner.py +58 -0
evalpilot-0.1.0/tests/test_sandbox.py +25 -0
evalpilot-0.1.0/tests/test_scorer_checkpoint.py +103 -0
evalpilot-0.1.0/tests/test_scorer_efficiency.py +29 -0
evalpilot-0.1.0/tests/test_scorer_llm_judge.py +36 -0
evalpilot-0.1.0/tests/test_scorer_trajectory.py +44 -0
evalpilot-0.1.0/tests/test_wizard.py +122 -0

evalpilot-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 ailearneryang
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

evalpilot-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,144 @@
+Metadata-Version: 2.4
+Name: evalpilot
+Version: 0.1.0
+Summary: A general framework for online-driving, recording, and scoring AI agents against task suites.
+Author-email: ailearneryang <ailearneryang@gmail.com>
+License: MIT
+Keywords: agent,evaluation,eval,llm,benchmark,testing,cli
+Classifier: Development Status :: 3 - Alpha
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Testing
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pydantic>=2.6
+Requires-Dist: PyYAML>=6.0
+Requires-Dist: jinja2>=3.1
+Requires-Dist: click>=8.1
+Requires-Dist: anthropic>=0.40
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == "dev"
+Provides-Extra: webui
+Requires-Dist: playwright>=1.40; extra == "webui"
+Dynamic: license-file
+# eval-agent
+A general framework for evaluating agents. It drives any agent live ("online")
+through a task suite, records its trajectory, scores it with four pluggable
+scorers (objective checkpoints, LLM-as-judge, trajectory/process, efficiency),
+and emits a JSON + static HTML report.
+## Install
+Published on PyPI as `evalpilot` (the CLI is available as `eval-agent`,
+`evalagent`, or `evalpilot` — all the same command).
+Run with no install (recommended):
+    uvx evalpilot run
+Install as a tool:
+    uv tool install evalpilot      # or: pipx install evalpilot
+For browser-UI (`webui`) agents, install the extra and the browser once:
+    uv tool install "evalpilot[webui]"
+    playwright install chromium
+From source (development):
+    python -m venv .venv
+    .venv/bin/pip install -e ".[dev]"
+## Quick start
+    .venv/bin/eval-agent run --suite suites/example.yaml --agent configs/function-echo.yaml --run-id demo --no-llm-judge
+    open runs/demo/report.html
+Or run it interactively — omit `--suite`/`--agent` and a wizard walks you through
+defining the agent and tasks (no YAML needed):
+    eval-agent run
+Browse all runs at once:
+    open runs/index.html
+## Concepts
+- **Task / TaskSuite** — declarative units of evaluation (prompt, setup,
+  checkpoints, rubric, budget).
+- **AgentAdapter** — narrow interface that drives the agent under test. Built-in:
+  `function`, `cli`, `http`, `webui`. Add a new agent by writing one subclass.
+- **Trajectory** — the recorded run (steps, output, usage, final state). Sole
+  data source for all scorers.
+- **Scorers** — checkpoint, llm_judge, trajectory, efficiency. Read-only,
+  pluggable, weighted. Checkpoints include file/command checks plus
+  `output_contains` / `output_matches` for grading text answers objectively.
+- **Runner / Reporter** — orchestrate and report.
+## Adding an agent
+Write a YAML config selecting an adapter type. For a CLI agent:
+    type: cli
+    name: my-agent
+    command: my-agent --prompt {prompt} --json-stream
+For an in-process Python callable:
+    type: function
+    name: my-agent
+    entrypoint: my_module:run
+For an OpenAI-style HTTP endpoint:
+    type: http
+    name: my-agent
+    url: http://localhost:8000/v1/chat/completions
+For a browser-based (web UI) agent:
+    type: webui
+    name: my-chat-agent
+    url: https://my-agent.example.com/chat
+    input_selector: "textarea"
+    output_selector: ".message.assistant:last-child"
+The `webui` adapter needs Playwright (an optional extra). From a source
+checkout (for a PyPI install, use `evalpilot[webui]` as shown under Install):
+    pip install -e ".[webui]"
+    playwright install chromium
+It types the prompt into `input_selector`, waits until the reply text in
+`output_selector` stops changing (default 1.5s), then scrapes it. Reuse a
+logged-in session with `storage_state: ./auth.json`.
+## Scoring weights
+Defaults: checkpoint=3, llm_judge=2, trajectory=1, efficiency=0. Override per
+suite (`default_weights`) or per task (`weights`). Each scorer's sub-score and
+its reason are always preserved in the report, not just the aggregate.
+## Commands
+    eval-agent run --suite SUITE.yaml --agent AGENT.yaml [--run-id ID] [--out DIR] [--no-llm-judge]
+    eval-agent run                            # interactive wizard (builds agent + tasks)
+    eval-agent report --run runs/ID           # re-render report.html from result.json
+    eval-agent compare runs/a runs/b          # combined leaderboard across runs
+    eval-agent index [--out runs]             # overview page listing all runs -> runs/index.html
+## Testing
+    .venv/bin/python -m pytest
+All scorer/adapter tests inject fakes (fake CLI scripts, fake HTTP post, fake
+LLM completion), so the suite runs fully offline with no network or API key.

evalpilot-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,114 @@
+# eval-agent
+A general framework for evaluating agents. It drives any agent live ("online")
+through a task suite, records its trajectory, scores it with four pluggable
+scorers (objective checkpoints, LLM-as-judge, trajectory/process, efficiency),
+and emits a JSON + static HTML report.
+## Install
+Published on PyPI as `evalpilot` (the CLI is available as `eval-agent`,
+`evalagent`, or `evalpilot` — all the same command).
+Run with no install (recommended):
+    uvx evalpilot run
+Install as a tool:
+    uv tool install evalpilot      # or: pipx install evalpilot
+For browser-UI (`webui`) agents, install the extra and the browser once:
+    uv tool install "evalpilot[webui]"
+    playwright install chromium
+From source (development):
+    python -m venv .venv
+    .venv/bin/pip install -e ".[dev]"
+## Quick start
+    .venv/bin/eval-agent run --suite suites/example.yaml --agent configs/function-echo.yaml --run-id demo --no-llm-judge
+    open runs/demo/report.html
+Or run it interactively — omit `--suite`/`--agent` and a wizard walks you through
+defining the agent and tasks (no YAML needed):
+    eval-agent run
+Browse all runs at once:
+    open runs/index.html
+## Concepts
+- **Task / TaskSuite** — declarative units of evaluation (prompt, setup,
+  checkpoints, rubric, budget).
+- **AgentAdapter** — narrow interface that drives the agent under test. Built-in:
+  `function`, `cli`, `http`, `webui`. Add a new agent by writing one subclass.
+- **Trajectory** — the recorded run (steps, output, usage, final state). Sole
+  data source for all scorers.
+- **Scorers** — checkpoint, llm_judge, trajectory, efficiency. Read-only,
+  pluggable, weighted. Checkpoints include file/command checks plus
+  `output_contains` / `output_matches` for grading text answers objectively.
+- **Runner / Reporter** — orchestrate and report.
+## Adding an agent
+Write a YAML config selecting an adapter type. For a CLI agent:
+    type: cli
+    name: my-agent
+    command: my-agent --prompt {prompt} --json-stream
+For an in-process Python callable:
+    type: function
+    name: my-agent
+    entrypoint: my_module:run
+For an OpenAI-style HTTP endpoint:
+    type: http
+    name: my-agent
+    url: http://localhost:8000/v1/chat/completions
+For a browser-based (web UI) agent:
+    type: webui
+    name: my-chat-agent
+    url: https://my-agent.example.com/chat
+    input_selector: "textarea"
+    output_selector: ".message.assistant:last-child"
+The `webui` adapter needs Playwright (an optional extra). From a source
+checkout (for a PyPI install, use `evalpilot[webui]` as shown under Install):
+    pip install -e ".[webui]"
+    playwright install chromium
+It types the prompt into `input_selector`, waits until the reply text in
+`output_selector` stops changing (default 1.5s), then scrapes it. Reuse a
+logged-in session with `storage_state: ./auth.json`.
+## Scoring weights
+Defaults: checkpoint=3, llm_judge=2, trajectory=1, efficiency=0. Override per
+suite (`default_weights`) or per task (`weights`). Each scorer's sub-score and
+its reason are always preserved in the report, not just the aggregate.
+## Commands
+    eval-agent run --suite SUITE.yaml --agent AGENT.yaml [--run-id ID] [--out DIR] [--no-llm-judge]
+    eval-agent run                            # interactive wizard (builds agent + tasks)
+    eval-agent report --run runs/ID           # re-render report.html from result.json
+    eval-agent compare runs/a runs/b          # combined leaderboard across runs
+    eval-agent index [--out runs]             # overview page listing all runs -> runs/index.html
+## Testing
+    .venv/bin/python -m pytest
+All scorer/adapter tests inject fakes (fake CLI scripts, fake HTTP post, fake
+LLM completion), so the suite runs fully offline with no network or API key.

evalpilot-0.1.0/eval_agent/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.1.0"

evalpilot-0.1.0/eval_agent/adapters/__init__.py ADDED Viewed

@@ -0,0 +1,39 @@
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from pathlib import Path
+from eval_agent.models import Task, Trajectory
+class AgentAdapter(ABC):
+    """Abstract base for everything that can drive an agent under test.
+    The contract is deliberately small: the framework only ever calls
+    ``run()`` for one task at a time. Whether the agent is reached through
+    a subprocess, an HTTP call or an in-process function is a private
+    detail of each concrete adapter.
+    """
+    # Display name of the adapter; concrete adapters override it.
+    name: str = "agent"
+    @abstractmethod
+    def run(self, task: Task, workspace: Path) -> Trajectory:
+        """Drive the agent for ``task`` and return the recorded Trajectory.
+        ``workspace`` is the directory made available to the agent for
+        this task. Concrete adapters must override this method.
+        """
+def build_adapter(config: dict) -> AgentAdapter:
+    """Instantiate the adapter selected by ``config["type"]``.
+    Recognised types are "function", "cli", "http" and "webui". The module
+    implementing an adapter is imported only when that type is requested,
+    and the whole config dict is then handed to the class's ``from_config``.
+    Raises ValueError for any other type (and KeyError if "type" is absent).
+    """
+    kind = config["type"]
+    match kind:
+        case "cli":
+            from eval_agent.adapters.cli import CLIAgentAdapter as adapter_cls
+        case "http":
+            from eval_agent.adapters.http import HTTPAgentAdapter as adapter_cls
+        case "function":
+            from eval_agent.adapters.function import FunctionAgentAdapter as adapter_cls
+        case "webui":
+            from eval_agent.adapters.webui import WebUIAgentAdapter as adapter_cls
+        case _:
+            raise ValueError(f"unknown adapter type: {kind}")
+    return adapter_cls.from_config(config)

evalpilot-0.1.0/eval_agent/adapters/cli.py ADDED Viewed

@@ -0,0 +1,79 @@
+from __future__ import annotations
+import json
+import shlex
+import subprocess
+import time
+from pathlib import Path
+from eval_agent.adapters import AgentAdapter
+from eval_agent.models import Task, Trajectory, Step, Usage
+class CLIAgentAdapter(AgentAdapter):
+    """Drive a command-line agent that emits a JSONL event stream on stdout.
+    ``command_template`` may contain ``{prompt}``; it is filled in with the
+    shell-quoted task prompt via ``str.format``, so any other literal brace
+    in the template must be doubled (``{{`` / ``}}``). The command is run
+    through the shell with ``cwd=workspace``.
+    Recognised events (one JSON object per stdout line):
+      {"type": "tool_use", "name": ..., "input": {...}}   -> tool_call step
+      {"type": "text", "text": ...}                       -> message step
+      {"type": "result", "output": ..., "usage": {"total_tokens": N}}
+    Blank lines, lines that are not JSON, JSON values that are not objects
+    and events of any other type are ignored.
+    """
+    name = "cli-agent"
+    def __init__(self, command_template: str, name: str = "cli-agent"):
+        self.command_template = command_template
+        self.name = name
+    @classmethod
+    def from_config(cls, config: dict) -> "CLIAgentAdapter":
+        """Build from a config dict: required "command", optional "name"."""
+        return cls(command_template=config["command"],
+                   name=config.get("name", "cli-agent"))
+    @staticmethod
+    def _parse_stream(stdout: str) -> tuple[list[Step], str, int]:
+        """Parse JSONL stdout into (steps, final_output, total_tokens)."""
+        steps: list[Step] = []
+        final_output = ""
+        total_tokens = 0
+        for line in stdout.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            # A log line such as `42` or `"done"` is valid JSON but not an
+            # event object; skip it instead of failing on `.get`.
+            if not isinstance(obj, dict):
+                continue
+            t = obj.get("type")
+            if t == "tool_use":
+                steps.append(Step(index=len(steps), kind="tool_call",
+                                  name=obj.get("name"),
+                                  payload=obj.get("input", {})))
+            elif t == "text":
+                steps.append(Step(index=len(steps), kind="message",
+                                  payload={"text": obj.get("text", "")}))
+            elif t == "result":
+                final_output = obj.get("output", "")
+                # Tolerate `"usage": null` (or any non-object usage value).
+                usage = obj.get("usage")
+                total_tokens = (usage.get("total_tokens", 0)
+                                if isinstance(usage, dict) else 0)
+        return steps, final_output, total_tokens
+    def run(self, task: Task, workspace: Path) -> Trajectory:
+        """Run the command for ``task`` in ``workspace`` and record the result.
+        Returns a Trajectory whose status is "success" (exit code 0),
+        "crash" (non-zero exit; ``error`` holds the last 2000 characters of
+        stderr) or "timeout" (``task.budget.timeout_seconds`` exceeded; no
+        steps or output are recorded in that case).
+        """
+        # NOTE: shlex.quote quotes for POSIX shells.
+        cmd = self.command_template.format(prompt=shlex.quote(task.prompt))
+        start = time.monotonic()
+        try:
+            proc = subprocess.run(
+                cmd, shell=True, cwd=str(workspace),
+                capture_output=True, text=True,
+                timeout=task.budget.timeout_seconds,
+            )
+        except subprocess.TimeoutExpired:
+            return Trajectory(
+                steps=[],
+                usage=Usage(wall_seconds=time.monotonic() - start,
+                            num_steps=0),
+                status="timeout",
+                error="process exceeded timeout",
+            )
+        steps, final_output, total_tokens = self._parse_stream(proc.stdout)
+        status = "success" if proc.returncode == 0 else "crash"
+        return Trajectory(
+            steps=steps,
+            final_output=final_output,
+            usage=Usage(total_tokens=total_tokens,
+                        wall_seconds=time.monotonic() - start,
+                        num_steps=len(steps)),
+            status=status,
+            error=None if status == "success" else proc.stderr[-2000:],
+        )

evalpilot-0.1.0/eval_agent/adapters/function.py ADDED Viewed

@@ -0,0 +1,61 @@
+from __future__ import annotations
+import importlib
+import time
+from pathlib import Path
+from typing import Callable, Optional
+from eval_agent.adapters import AgentAdapter
+from eval_agent.models import Task, Trajectory, Step, Usage
+class FunctionAgentAdapter(AgentAdapter):
+    """Drive an in-process Python callable.
+    The callable is invoked as ``fn(task, workspace, record)`` where
+    ``workspace`` is passed as a string path and ``record(kind, name,
+    payload)`` appends a step to the trajectory. It may return a dict with
+    the optional keys ``final_output`` (str), ``tokens`` (int) and
+    ``retries`` (int); a falsy return value is treated as an empty dict.
+    """
+    name = "function-agent"
+    def __init__(self, fn: Callable, name: Optional[str] = None):
+        self.fn = fn
+        # A falsy name keeps the class-level default.
+        if name:
+            self.name = name
+    @classmethod
+    def from_config(cls, config: dict) -> "FunctionAgentAdapter":
+        """Build from a config dict: required "entrypoint", optional "name".
+        ``entrypoint`` has the form ``"module.path:callable"``; the part
+        after the colon may be a dotted attribute path such as
+        ``"pkg.mod:Class.method"``.
+        Raises ValueError when the entrypoint is not of that form.
+        """
+        entrypoint = config["entrypoint"]
+        module_path, sep, attr_path = entrypoint.partition(":")
+        if not sep or not module_path or not attr_path or ":" in attr_path:
+            raise ValueError(
+                "entrypoint must look like 'module.path:callable', "
+                f"got {entrypoint!r}")
+        fn = importlib.import_module(module_path)
+        # Walk dotted attributes so nested callables can be referenced.
+        for part in attr_path.split("."):
+            fn = getattr(fn, part)
+        return cls(fn=fn, name=config.get("name", "function-agent"))
+    def run(self, task: Task, workspace: Path) -> Trajectory:
+        """Call the wrapped function and turn its result into a Trajectory.
+        Any exception raised by the callable (or while reading its return
+        value) is recorded as a "crash" trajectory instead of propagating;
+        steps recorded before the failure are kept.
+        """
+        steps: list[Step] = []
+        def record(kind: str, name=None, payload=None):
+            steps.append(Step(index=len(steps), kind=kind, name=name,
+                              payload=payload or {}))
+        start = time.monotonic()
+        try:
+            ret = self.fn(task, str(workspace), record) or {}
+            elapsed = time.monotonic() - start
+            return Trajectory(
+                steps=steps,
+                final_output=ret.get("final_output", ""),
+                usage=Usage(
+                    total_tokens=ret.get("tokens", 0),
+                    wall_seconds=elapsed,
+                    num_steps=len(steps),
+                    num_retries=ret.get("retries", 0),
+                ),
+                status="success",
+            )
+        except Exception as e:
+            # Adapter boundary: a failing agent must not abort the whole run.
+            elapsed = time.monotonic() - start
+            return Trajectory(
+                steps=steps,
+                usage=Usage(wall_seconds=elapsed, num_steps=len(steps)),
+                status="crash",
+                error=f"{type(e).__name__}: {e}",
+            )

evalpilot-0.1.0/eval_agent/adapters/http.py ADDED Viewed

@@ -0,0 +1,78 @@
+from __future__ import annotations
+import json as jsonlib
+import time
+from pathlib import Path
+from typing import Callable, Optional
+from eval_agent.adapters import AgentAdapter
+from eval_agent.models import Task, Trajectory, Step, Usage
+def _default_post(url: str, json_body: dict, timeout: Optional[float]):
+    """POST ``json_body`` as JSON to ``url`` and return the decoded JSON reply.
+    Uses only the standard library (``urllib``). ``timeout`` is forwarded to
+    ``urlopen``; errors raised by ``urlopen`` or by JSON decoding propagate
+    to the caller.
+    """
+    from urllib.request import Request, urlopen
+    payload = jsonlib.dumps(json_body).encode()
+    request = Request(url, data=payload,
+                      headers={"Content-Type": "application/json"})
+    with urlopen(request, timeout=timeout) as response:
+        raw = response.read()
+    return jsonlib.loads(raw.decode())
+class HTTPAgentAdapter(AgentAdapter):
+    """Drive an OpenAI-style chat endpoint.
+    Each task is sent as ``{"messages": [{"role": "user", "content":
+    prompt}]}`` and the reply is read from ``choices[0]["message"]``: its
+    text content becomes a message step and the final output, and each
+    entry of ``tool_calls`` becomes a tool_call step.
+    ``post(url, json_body, timeout) -> dict`` is injectable for testing.
+    """
+    name = "http-agent"
+    def __init__(self, url: str, post: Callable = _default_post,
+                 name: str = "http-agent"):
+        self.url = url
+        self.post = post
+        self.name = name
+    @classmethod
+    def from_config(cls, config: dict) -> "HTTPAgentAdapter":
+        """Build from a config dict: required "url", optional "name"."""
+        return cls(url=config["url"], name=config.get("name", "http-agent"))
+    @staticmethod
+    def _crashed(start: float, exc: Exception) -> Trajectory:
+        """Build the "crash" trajectory for a failed request or bad reply."""
+        return Trajectory(
+            usage=Usage(wall_seconds=time.monotonic() - start),
+            status="crash",
+            error=f"{type(exc).__name__}: {exc}",
+        )
+    @staticmethod
+    def _parse_tool_args(raw):
+        """Decode a tool call's ``arguments`` value into a step payload.
+        Accepts a JSON string, an already-decoded dict, or None (treated as
+        no arguments); anything that cannot be decoded is kept under "raw".
+        """
+        if raw is None:
+            return {}
+        if isinstance(raw, dict):
+            return raw
+        try:
+            return jsonlib.loads(raw)
+        except (TypeError, jsonlib.JSONDecodeError):
+            return {"raw": raw}
+    def run(self, task: Task, workspace: Path) -> Trajectory:
+        """POST the task prompt and convert the reply into a Trajectory.
+        A transport error or a reply that lacks the expected structure
+        yields a "crash" trajectory rather than an exception. ``workspace``
+        is not used by this adapter.
+        """
+        body = {"messages": [{"role": "user", "content": task.prompt}]}
+        start = time.monotonic()
+        try:
+            resp = self.post(self.url, body, task.budget.timeout_seconds)
+        except Exception as e:
+            return self._crashed(start, e)
+        try:
+            message = resp["choices"][0]["message"]
+            steps: list[Step] = []
+            text = message.get("content") or ""
+            if text:
+                steps.append(Step(index=len(steps), kind="message",
+                                  payload={"text": text}))
+            for tc in message.get("tool_calls") or []:
+                # "function" may be present but null in some replies.
+                fn = tc.get("function") or {}
+                args = self._parse_tool_args(fn.get("arguments", "{}"))
+                steps.append(Step(index=len(steps), kind="tool_call",
+                                  name=fn.get("name"), payload=args))
+            # Read usage inside the guarded section and tolerate
+            # `"usage": null`, so a reply without token counts cannot raise
+            # out of run().
+            usage = resp.get("usage")
+            total_tokens = (usage.get("total_tokens", 0)
+                            if isinstance(usage, dict) else 0)
+        except Exception as e:
+            return self._crashed(start, e)
+        return Trajectory(
+            steps=steps,
+            final_output=text,
+            usage=Usage(total_tokens=total_tokens,
+                        wall_seconds=time.monotonic() - start,
+                        num_steps=len(steps)),
+            status="success",
+        )