ecp-runtime 0.2.1 (ecp_runtime-0.2.1.tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ecp_runtime-0.2.1/.gitignore +207 -0
- ecp_runtime-0.2.1/PKG-INFO +18 -0
- ecp_runtime-0.2.1/README.md +4 -0
- ecp_runtime-0.2.1/pyproject.toml +24 -0
- ecp_runtime-0.2.1/src/ecp_runtime/__init__.py +1 -0
- ecp_runtime-0.2.1/src/ecp_runtime/cli.py +141 -0
- ecp_runtime-0.2.1/src/ecp_runtime/graders.py +168 -0
- ecp_runtime-0.2.1/src/ecp_runtime/manifest.py +41 -0
- ecp_runtime-0.2.1/src/ecp_runtime/reporter.py +70 -0
- ecp_runtime-0.2.1/src/ecp_runtime/runner.py +214 -0
ecp_runtime-0.2.1/.gitignore
@@ -0,0 +1,207 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/

# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore

# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
ecp_runtime-0.2.1/PKG-INFO
@@ -0,0 +1,18 @@
Metadata-Version: 2.4
Name: ecp-runtime
Version: 0.2.1
Summary: The reference runtime for the Evaluation Context Protocol (ECP).
Author-email: ECP Maintainers <hello@ecp.org>
Requires-Python: >=3.9
Requires-Dist: jinja2>=3.1.0
Requires-Dist: openai>=1.0.0
Requires-Dist: pydantic>=2.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: rich>=13.0
Requires-Dist: typer>=0.9.0
Description-Content-Type: text/markdown

# ECP Runtime

This is the reference implementation of the Evaluation Context Protocol (ECP) Runtime.
It includes the CLI tool `ecp` for running agent evaluations.
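For orientation, here is a hypothetical invocation of the `ecp` CLI this package installs. The flags match the options declared in src/ecp_runtime/cli.py below; eval.yaml is a placeholder path, not a file shipped with the package:

    # Run an evaluation, writing both an HTML and a JSON report
    ecp run --manifest eval.yaml --report report.html --json-out report.json

    # Print the JSON summary to stdout and keep a zero exit code even if checks fail
    ecp run -m eval.yaml --json --no-fail-on-error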
ecp_runtime-0.2.1/pyproject.toml
@@ -0,0 +1,24 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "ecp-runtime"
version = "0.2.1"
description = "The reference runtime for the Evaluation Context Protocol (ECP)."
authors = [
    { name = "ECP Maintainers", email = "hello@ecp.org" },
]
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
    "typer>=0.9.0",    # For the CLI
    "pyyaml>=6.0",     # For reading Manifests
    "pydantic>=2.0",   # For validating Schemas
    "rich>=13.0",      # For pretty terminal output
    "openai>=1.0.0",   # For the LLM Judge
    "Jinja2>=3.1.0",   # For HTML report rendering
]

[project.scripts]
ecp = "ecp_runtime.cli:app"
ecp_runtime-0.2.1/src/ecp_runtime/__init__.py
@@ -0,0 +1 @@
__version__ = "0.1.0"
ecp_runtime-0.2.1/src/ecp_runtime/cli.py
@@ -0,0 +1,141 @@
import logging
import typer
import sys
import os
import json
from pathlib import Path
from typing import Dict, Any, Optional

from .reporter import HTMLReporter

# Import local modules (Using relative imports)
try:
    from .manifest import ECPManifest
    from .runner import ECPRunner
except ImportError:
    # Fallback for direct execution debugging
    sys.path.append(os.path.dirname(__file__))
    from manifest import ECPManifest
    from runner import ECPRunner

app = typer.Typer(
    name="ecp",
    help="Evaluation Context Protocol Runtime CLI",
    add_completion=False
)

logger = logging.getLogger(__name__)


def _configure_logging(verbose: bool) -> None:
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%H:%M:%S",
    )


@app.callback()
def main():
    """
    Official Runtime for the Evaluation Context Protocol (ECP).
    """
    pass


@app.command()
def run(
    manifest: Path = typer.Option(
        ...,
        "--manifest", "-m",
        exists=True,
        file_okay=True,
        dir_okay=False,
        writable=False,
        readable=True,
        resolve_path=True,
        help="Path to the test manifest YAML file"
    ),

    verbose: bool = False,
    report: Optional[Path] = typer.Option(
        None,
        "--report",
        help="Path to save an HTML report (e.g., output.html)",
        resolve_path=True,
    ),
    json_out: Optional[Path] = typer.Option(
        None,
        "--json-out",
        help="Path to save a JSON report (e.g., output.json)",
        resolve_path=True,
    ),
    print_json: bool = typer.Option(
        False,
        "--json",
        help="Print the JSON report to stdout",
    ),
    fail_on_error: bool = typer.Option(
        True,
        "--fail-on-error/--no-fail-on-error",
        help="Exit non-zero if any checks fail (useful for CI)",
    ),
):
    """
    Execute an evaluation run based on a manifest file.
    """
    _configure_logging(verbose)

    logger.info("ECP Runtime Initializing...")
    logger.info("Loading manifest: %s", manifest)

    try:
        # Load the YAML
        config = ECPManifest.from_yaml(str(manifest))

        # Run the Tests
        runner = ECPRunner(config)
        result_summary = runner.run_scenarios()
        total = int(result_summary.get("total", 0) or 0)
        passed = int(result_summary.get("passed", 0) or 0)
        failed = max(total - passed, 0)

        report_payload: Dict[str, Any] = {
            "manifest": str(manifest),
            "passed": passed,
            "total": total,
            "failed": failed,
            "scenarios": result_summary.get("scenarios", []),
        }

        if report:
            logger.info("Generating HTML report: %s", report)
            reporter = HTMLReporter()
            # Feed scenarios directly to reporter
            for scenario in result_summary.get("scenarios", []):
                reporter.add_scenario(scenario.get("name"), scenario.get("steps", []))
            reporter.save(str(report))
            logger.info("Report saved to %s", report)

        if json_out:
            json_out.write_text(json.dumps(report_payload, indent=2), encoding="utf-8")
            logger.info("JSON report saved to %s", json_out)

        if print_json:
            typer.echo(json.dumps(report_payload, indent=2))

        if fail_on_error and failed > 0:
            raise typer.Exit(code=2)

    except typer.Exit:
        raise
    except Exception as e:
        logger.error("CRITICAL ERROR: %s", e)
        if verbose:
            raise e
        sys.exit(1)


if __name__ == "__main__":
    app()
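Based on the report_payload constructed above, a run printed with --json should look roughly like the sketch below. All values are illustrative, not produced by an actual run; the check keys (type, field, passed, score, reasoning) come from graders.py further down:

    {
      "manifest": "/abs/path/to/eval.yaml",
      "passed": 3,
      "total": 4,
      "failed": 1,
      "scenarios": [
        {
          "name": "basic-greeting",
          "steps": [
            {
              "input": "Say hello to the user.",
              "output": "Hello!",
              "checks": [
                {"type": "text_match", "field": "public_output",
                 "passed": true, "score": 1.0,
                 "reasoning": "Expected to contain 'hello'"}
              ]
            }
          ]
        }
      ]
    }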
ecp_runtime-0.2.1/src/ecp_runtime/graders.py
@@ -0,0 +1,168 @@
import re
import os
from typing import Any, Dict, List, Tuple
from .manifest import GraderConfig, StepConfig

# Try importing OpenAI, but don't crash if it's missing (unless used)
try:
    from openai import OpenAI
except ImportError:
    OpenAI = None

def check_text_match(grader: GraderConfig, text: str) -> Tuple[bool, str]:
    """Handles simple string assertions."""
    if not text:
        return False, "Text was empty"

    val = grader.value
    if grader.condition in {"contains", "equals", "does_not_contain"} and val is None:
        return False, "No 'value' provided for text_match condition"
    if grader.condition == "regex" and not grader.pattern:
        return False, "No 'pattern' provided for regex condition"

    if grader.condition == "contains":
        return val in text, f"Expected to contain '{val}'"
    elif grader.condition == "equals":
        return val == text, f"Expected to equal '{val}'"
    elif grader.condition == "does_not_contain":
        return val not in text, f"Expected NOT to contain '{val}'"
    elif grader.condition == "regex":
        return bool(re.search(grader.pattern, text)), f"Regex '{grader.pattern}' failed"
    return False, "Unknown condition"

def check_llm_judge(grader: GraderConfig, text: str) -> Tuple[bool, str, float]:
    """
    Uses an LLM to evaluate the text.
    Returns: (passed, reasoning, score)
    """
    if not grader.prompt:
        return False, "No prompt provided for llm_judge", 0.0
    if OpenAI is None:
        return False, "OpenAI library not installed. Run 'pip install openai'", 0.0

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        return False, "OPENAI_API_KEY not set in environment", 0.0

    client = OpenAI(api_key=api_key)

    # 1. Construct the Prompt for the Judge
    system_prompt = "You are an impartial AI Judge. You evaluate outputs based on specific criteria."
    user_prompt = f"""
TASK: Evaluate the following text against the provided criteria.

[TEXT TO EVALUATE]
{text}

[CRITERIA]
{grader.prompt}

[ASSERTION]
Does the text satisfy the criteria?
If YES, end your response with "RESULT: PASS".
If NO, end your response with "RESULT: FAIL".
Provide a short reasoning before the result.
"""

    # 2. Call the Judge (using a cheap, smart model)
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # Fast, cheap, smart enough for grading
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.0
        )
        content = response.choices[0].message.content

        # 3. Parse the Verdict
        passed = "RESULT: PASS" in content
        reasoning = content.replace("RESULT: PASS", "").replace("RESULT: FAIL", "").strip()

        return passed, reasoning, 1.0 if passed else 0.0

    except Exception as e:
        return False, f"LLM Judge Error: {str(e)}", 0.0

def evaluate_step(step_config: StepConfig, result_obj: Any) -> List[Dict[str, Any]]:
    """
    Runs all graders for a single step against the agent's result.
    """
    check_results = []

    for grader in step_config.graders:
        # Select target field
        target_text = ""
        if grader.field == "public_output":
            target_text = result_obj.public_output
        elif grader.field == "private_thought":
            target_text = result_obj.private_thought

        passed = False
        reasoning = ""
        score = 0.0

        # Router
        if grader.type == "text_match":
            passed, reasoning = check_text_match(grader, target_text or "")
            score = 1.0 if passed else 0.0

        elif grader.type == "llm_judge":
            passed, reasoning, score = check_llm_judge(grader, target_text or "")
        elif grader.type == "tool_usage":
            passed, reasoning = check_tool_usage(grader, getattr(result_obj, "tool_calls", None) or [])
            score = 1.0 if passed else 0.0

        check_results.append({
            "type": grader.type,
            "field": grader.field,
            "passed": passed,
            "score": score,
            "reasoning": reasoning
        })

    return check_results

def check_tool_usage(grader: GraderConfig, tool_calls: List[Dict[str, Any]]) -> Tuple[bool, str]:
    """Checks the JSON-RPC result.tool_calls list for a matching tool invocation.

    Expectations come from the manifest:
    - grader.tool_name: required exact match against call name (if provided)
    - grader.arguments: all provided key/value pairs must be present in call arguments
    """
    if not isinstance(tool_calls, list) or not tool_calls:
        return False, "No tool_calls present. Agent did not report any tool usage."

    expected_name = grader.tool_name
    expected_args = grader.arguments or {}

    def args_match(actual: Dict[str, Any], expected: Dict[str, Any]) -> bool:
        for k, v in expected.items():
            if k not in actual:
                return False
            if actual[k] != v:
                return False
        return True

    for call in tool_calls:
        name = call.get("name") or call.get("tool") or call.get("id")
        args = call.get("arguments") or call.get("args") or {}

        # Name must match if specified
        if expected_name and name != expected_name:
            continue

        # Arguments must include expected pairs
        if args_match(args if isinstance(args, dict) else {}, expected_args):
            return True, f"Found tool call '{name}' with expected arguments"

    available = [call.get("name") or call.get("tool") or call.get("id") for call in tool_calls]
    reason = "No matching tool call"
    if expected_name:
        reason += f" for name='{expected_name}'"
    if expected_args:
        reason += f" and arguments={expected_args}"
    if available:
        reason += f". Calls seen: {available}"
    return False, reason
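To make the matching rules of check_tool_usage concrete, consider this small illustrative case (the tool names and arguments are invented). A grader with tool_name "get_weather" and arguments {"city": "Paris"} passes against the first call below: the name matches exactly, and the expected key/value pairs only need to be a subset of the call's arguments, so the extra "units" key is ignored:

    # Hypothetical result.tool_calls payload reported by an agent
    tool_calls = [
        {"name": "get_weather", "arguments": {"city": "Paris", "units": "metric"}},
        {"name": "send_email", "arguments": {"to": "someone@example.com"}},
    ]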
ecp_runtime-0.2.1/src/ecp_runtime/manifest.py
@@ -0,0 +1,41 @@
from typing import List, Optional, Dict, Any, Union
from pydantic import BaseModel, Field
import yaml

# --- The Grader (Assertion) Schema ---
class GraderConfig(BaseModel):
    type: str
    field: str = "public_output"
    # For text_match
    condition: Optional[str] = None
    value: Optional[str] = None
    pattern: Optional[str] = None
    # For llm_judge
    prompt: Optional[str] = None
    assertion: Optional[str] = None
    # For tool_usage
    tool_name: Optional[str] = None
    arguments: Dict[str, Any] = Field(default_factory=dict)

# --- The Step (Scenario) Schema ---
class StepConfig(BaseModel):
    input: str
    constraints: Dict[str, Any] = Field(default_factory=dict)
    graders: List[GraderConfig] = Field(default_factory=list)

class ScenarioConfig(BaseModel):
    name: str
    steps: List[StepConfig]

# --- The Root Manifest Schema ---
class ECPManifest(BaseModel):
    manifest_version: str = "v1"
    name: str
    target: str
    scenarios: List[ScenarioConfig]

    @classmethod
    def from_yaml(cls, path: str) -> "ECPManifest":
        with open(path, "r") as f:
            data = yaml.safe_load(f)
        return cls(**data)
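Putting the schema together, a minimal manifest accepted by ECPManifest.from_yaml could look like the sketch below. All names, the target command, and the assertions are hypothetical; the field names come straight from the pydantic models above:

    manifest_version: v1
    name: weather-agent-smoke-test
    target: "python my_agent.py"   # shell command the runner launches
    scenarios:
      - name: basic-greeting
        steps:
          - input: "Say hello to the user."
            graders:
              - type: text_match
                field: public_output
                condition: contains
                value: "hello"
              - type: llm_judge
                prompt: "The reply is polite and in English."
      - name: weather-lookup
        steps:
          - input: "What's the weather in Paris?"
            graders:
              - type: tool_usage
                tool_name: get_weather
                arguments:
                  city: "Paris"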
ecp_runtime-0.2.1/src/ecp_runtime/reporter.py
@@ -0,0 +1,70 @@
import json
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
from jinja2 import Template

HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>ECP Evaluation Report</title>
    <style>
        body { font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; background: #f4f4f9; }
        .card { background: white; padding: 20px; margin-bottom: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        .pass { color: green; font-weight: bold; }
        .fail { color: red; font-weight: bold; }
        h1 { border-bottom: 2px solid #ddd; padding-bottom: 10px; }
        .meta { color: #666; font-size: 0.9em; }
        pre { background: #eee; padding: 10px; border-radius: 4px; overflow-x: auto; }
    </style>
</head>
<body>
    <h1>ECP Evaluation Report</h1>
    <div class="meta">Generated: {{ timestamp }}</div>

    {% for scenario in results %}
    <div class="card">
        <h2>Scenario: {{ scenario.name }}</h2>
        {% for step in scenario.steps %}
        <div style="border-top: 1px solid #eee; margin-top: 10px; padding-top: 10px;">
            <p><strong>Input:</strong> {{ step.input }}</p>
            <p><strong>Output:</strong> {{ step.output }}</p>

            <h4>Graders:</h4>
            <ul>
            {% for check in step.checks %}
                <li>
                    <span class="{{ 'pass' if check.passed else 'fail' }}">
                        {{ '✅ PASS' if check.passed else '❌ FAIL' }}
                    </span>
                    {{ check.type }}
                    {% if check.reasoning %}
                        <br><small>Reason: {{ check.reasoning }}</small>
                    {% endif %}
                </li>
            {% endfor %}
            </ul>
        </div>
        {% endfor %}
    </div>
    {% endfor %}
</body>
</html>
"""

class HTMLReporter:
    def __init__(self):
        self.results = []

    def add_scenario(self, name: str, steps: List[Dict[str, Any]]):
        self.results.append({"name": name, "steps": steps})

    def save(self, filepath: str):
        template = Template(HTML_TEMPLATE)
        html_content = template.render(
            results=self.results,
            timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        )
        Path(filepath).write_text(html_content, encoding="utf-8")
        return filepath
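As a usage sketch, HTMLReporter can also be driven directly. The step dicts mirror the keys the template reads (input, output, checks); everything below is illustrative:

    from ecp_runtime.reporter import HTMLReporter

    reporter = HTMLReporter()
    reporter.add_scenario("demo", [
        {
            "input": "hi",
            "output": "hello",
            "checks": [{"type": "text_match", "passed": True, "reasoning": ""}],
        },
    ])
    reporter.save("demo_report.html")  # writes a self-contained HTML file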
ecp_runtime-0.2.1/src/ecp_runtime/runner.py
@@ -0,0 +1,214 @@
"""
Docstring for runtime.python.src.ecp_runtime.runner

Simplified Version. V0.1
AsyncIO Pending
"""

import subprocess
import json
import os
import time
import logging
import threading
import queue
from typing import Dict, Any, Optional, List
from dataclasses import dataclass
from .graders import evaluate_step

logger = logging.getLogger(__name__)


@dataclass
class StepResult:
    status: str
    public_output: Optional[str] = None
    private_thought: Optional[str] = None
    logs: Optional[str] = None
    tool_calls: Optional[List[Dict[str, Any]]] = None


class AgentProcess:
    """Manages the lifecycle of the Agent Child Process."""

    def __init__(self, command: str, rpc_timeout: float = 30.0):
        self.command = command
        self.rpc_timeout = rpc_timeout
        self.process = None

    def start(self):
        # Launch the agent and connect pipes to stdio
        self.process = subprocess.Popen(
            self.command,
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1  # Line buffered
        )

    def stop(self):
        if self.process:
            self.process.terminate()

    def send_rpc(self, method: str, params: Dict[str, Any] = None) -> Dict[str, Any]:
        """Sends a JSON-RPC request and waits for the response."""
        if not params:
            params = {}

        request = {
            "jsonrpc": "2.0",
            "method": method,
            "params": params,
            "id": int(time.time() * 1000)
        }

        # Write to Agent's STDIN
        json_str = json.dumps(request)
        self.process.stdin.write(json_str + "\n")
        self.process.stdin.flush()

        return self._read_json_response()

    def _read_json_response(self) -> Dict[str, Any]:
        start_time = time.time()
        last_non_json = None

        while True:
            elapsed = time.time() - start_time
            remaining = max(self.rpc_timeout - elapsed, 0)
            if remaining <= 0:
                stderr = self._safe_read_stderr()
                raise RuntimeError(
                    f"Agent response timed out after {self.rpc_timeout:.1f}s. "
                    f"Last non-JSON line: {last_non_json}. Stderr: {stderr}"
                )

            response_line = self._readline_with_timeout(remaining)
            if response_line is None:
                stderr = self._safe_read_stderr()
                raise RuntimeError(
                    f"Agent response timed out after {self.rpc_timeout:.1f}s. "
                    f"Last non-JSON line: {last_non_json}. Stderr: {stderr}"
                )

            if response_line == "":
                stderr = self._safe_read_stderr()
                raise RuntimeError(f"Agent crashed or closed connection. Stderr: {stderr}")

            line = response_line.strip()
            if not line:
                continue

            try:
                return json.loads(line)
            except json.JSONDecodeError:
                last_non_json = line
                logger.warning("Agent emitted non-JSON stdout: %s", line)
                continue

    def _readline_with_timeout(self, timeout: float) -> Optional[str]:
        if not self.process or not self.process.stdout:
            return None

        q: queue.Queue = queue.Queue(maxsize=1)

        def _reader():
            try:
                q.put(self.process.stdout.readline())
            except Exception:
                q.put("")

        t = threading.Thread(target=_reader, daemon=True)
        t.start()
        try:
            return q.get(timeout=timeout)
        except queue.Empty:
            return None

    def _safe_read_stderr(self) -> str:
        if not self.process or not self.process.stderr:
            return ""
        if self.process.poll() is None:
            return ""
        try:
            return self.process.stderr.read()
        except Exception:
            return ""


class ECPRunner:
    """The Orchestrator."""

    def __init__(self, manifest):
        self.manifest = manifest

    def run_scenarios(self):
        total_passed = 0
        total_checks = 0
        report_data: List[Dict[str, Any]] = []

        for scenario in self.manifest.scenarios:
            logger.info("Scenario: %s", scenario.name)

            rpc_timeout = float(os.environ.get("ECP_RPC_TIMEOUT", "30"))
            agent = AgentProcess(self.manifest.target, rpc_timeout=rpc_timeout)
            agent.start()

            try:
                agent.send_rpc("agent/initialize", {"config": {}})
                scenario_steps: List[Dict[str, Any]] = []

                for i, step in enumerate(scenario.steps):
                    # Execute
                    rpc_resp = agent.send_rpc("agent/step", {"input": step.input})
                    result_data = rpc_resp.get("result", {})

                    # Map to internal object
                    step_result = StepResult(
                        status=result_data.get("status", "done"),
                        public_output=result_data.get("public_output"),
                        private_thought=result_data.get("private_thought"),
                        tool_calls=result_data.get("tool_calls") if isinstance(result_data.get("tool_calls"), list) else None
                    )

                    logger.info("Step %d: Input='%s'", i + 1, step.input)
                    logger.info("Output: %s", step_result.public_output)
                    if step_result.private_thought:
                        logger.debug("Thought: %s", step_result.private_thought)

                    checks = evaluate_step(step, step_result)

                    for check in checks:
                        total_checks += 1
                        status = "PASS" if check["passed"] else "FAIL"
                        logger.info("%s | %s on %s", status, check["type"], check["field"])

                        if check["type"] == "llm_judge" or not check["passed"]:
                            logger.info("Reason: %s", check["reasoning"])

                        if check["passed"]:
                            total_passed += 1

                    # Collect for HTML report
                    scenario_steps.append({
                        "input": step.input,
                        "output": step_result.public_output,
                        "checks": checks
                    })

            finally:
                agent.stop()

            # Append scenario block
            report_data.append({"name": scenario.name, "steps": scenario_steps})

        logger.info("Run Complete. Passed: %d/%d", total_passed, total_checks)

        # Return structured report data
        return {
            "passed": total_passed,
            "total": total_checks,
            "scenarios": report_data
        }
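For completeness, here is a minimal sketch of an agent the runner could drive. It assumes only what runner.py shows: one JSON-RPC request per stdin line, methods agent/initialize and agent/step, and a result object carrying status/public_output/private_thought/tool_calls. This hypothetical echo agent is not part of the package:

    #!/usr/bin/env python3
    # Hypothetical echo agent speaking the stdio JSON-RPC protocol used by AgentProcess.
    import json
    import sys

    for raw in sys.stdin:
        line = raw.strip()
        if not line:
            continue
        request = json.loads(line)
        if request.get("method") == "agent/step":
            user_input = request.get("params", {}).get("input", "")
            result = {
                "status": "done",
                "public_output": f"You said: {user_input}",
                "private_thought": "Echoing the input verbatim.",
                "tool_calls": [],
            }
        else:
            # agent/initialize (or anything else) is simply acknowledged
            result = {"status": "ready"}
        response = {"jsonrpc": "2.0", "id": request.get("id"), "result": result}
        sys.stdout.write(json.dumps(response) + "\n")
        sys.stdout.flush()

Saved as my_agent.py, this is the kind of command the manifest's target field would point at.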