agent-eval-contract 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_eval_contract-0.2.0/CHANGELOG.md +11 -0
- agent_eval_contract-0.2.0/LICENSE +21 -0
- agent_eval_contract-0.2.0/MANIFEST.in +7 -0
- agent_eval_contract-0.2.0/PKG-INFO +107 -0
- agent_eval_contract-0.2.0/README.md +82 -0
- agent_eval_contract-0.2.0/RELEASE.md +53 -0
- agent_eval_contract-0.2.0/agent_eval_contract/__init__.py +104 -0
- agent_eval_contract-0.2.0/agent_eval_contract/clean_room.py +27 -0
- agent_eval_contract-0.2.0/agent_eval_contract/cli.py +114 -0
- agent_eval_contract-0.2.0/agent_eval_contract/external.py +215 -0
- agent_eval_contract-0.2.0/agent_eval_contract/fixture_runner.py +82 -0
- agent_eval_contract-0.2.0/agent_eval_contract/models.py +157 -0
- agent_eval_contract-0.2.0/agent_eval_contract/py.typed +1 -0
- agent_eval_contract-0.2.0/agent_eval_contract/release.py +52 -0
- agent_eval_contract-0.2.0/agent_eval_contract/release_metadata.json +39 -0
- agent_eval_contract-0.2.0/agent_eval_contract/samples/eval_failure.json +22 -0
- agent_eval_contract-0.2.0/agent_eval_contract/samples/eval_run.json +25 -0
- agent_eval_contract-0.2.0/agent_eval_contract/samples/eval_score.json +16 -0
- agent_eval_contract-0.2.0/agent_eval_contract/samples/eval_task.json +23 -0
- agent_eval_contract-0.2.0/agent_eval_contract/samples/external_result_normalization.json +39 -0
- agent_eval_contract-0.2.0/agent_eval_contract/samples.py +81 -0
- agent_eval_contract-0.2.0/agent_eval_contract/schema_export.py +42 -0
- agent_eval_contract-0.2.0/agent_eval_contract/schemas.py +47 -0
- agent_eval_contract-0.2.0/agent_eval_contract/templates.py +101 -0
- agent_eval_contract-0.2.0/agent_eval_contract/validators.py +116 -0
- agent_eval_contract-0.2.0/agent_eval_contract.egg-info/PKG-INFO +107 -0
- agent_eval_contract-0.2.0/agent_eval_contract.egg-info/SOURCES.txt +38 -0
- agent_eval_contract-0.2.0/agent_eval_contract.egg-info/dependency_links.txt +1 -0
- agent_eval_contract-0.2.0/agent_eval_contract.egg-info/entry_points.txt +3 -0
- agent_eval_contract-0.2.0/agent_eval_contract.egg-info/requires.txt +1 -0
- agent_eval_contract-0.2.0/agent_eval_contract.egg-info/top_level.txt +1 -0
- agent_eval_contract-0.2.0/docs/adapters.md +56 -0
- agent_eval_contract-0.2.0/docs/contract.md +45 -0
- agent_eval_contract-0.2.0/docs/field-reference.md +73 -0
- agent_eval_contract-0.2.0/examples/eval_run.json +15 -0
- agent_eval_contract-0.2.0/examples/swe_bench_result.json +13 -0
- agent_eval_contract-0.2.0/examples/terminal_bench_result.json +8 -0
- agent_eval_contract-0.2.0/pyproject.toml +73 -0
- agent_eval_contract-0.2.0/setup.cfg +4 -0
- agent_eval_contract-0.2.0/tests/test_agent_eval_contract.py +364 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.2.0 - 2026-07-04
|
|
4
|
+
|
|
5
|
+
- Reworked the package into a public Pydantic contract library for agent evaluation records.
|
|
6
|
+
- Added typed models, runtime validators, JSON Schema export, CLI subcommands, public examples, and fixture bundles.
|
|
7
|
+
- Removed private workflow vocabulary from the public core.
|
|
8
|
+
|
|
9
|
+
## 0.1.0 - 2026-06-27
|
|
10
|
+
|
|
11
|
+
- Initial internal package extraction.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jakye Amos
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-eval-contract
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Pydantic contracts and JSON Schemas for portable agent evaluation records.
|
|
5
|
+
Author: Jakye Amos
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jakyeamos/agent-eval-contract
|
|
8
|
+
Project-URL: Repository, https://github.com/jakyeamos/agent-eval-contract
|
|
9
|
+
Project-URL: Issues, https://github.com/jakyeamos/agent-eval-contract/issues
|
|
10
|
+
Keywords: agents,ai-evaluation,evals,harness,json-schema,pydantic
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
17
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
18
|
+
Classifier: Topic :: Software Development :: Testing
|
|
19
|
+
Classifier: Typing :: Typed
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: pydantic<3,>=2
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# Agent Eval Contract
|
|
27
|
+
|
|
28
|
+
Pydantic contracts and JSON Schemas for portable agent evaluation records.
|
|
29
|
+
|
|
30
|
+
Use this package when you are experimenting with agents, harnesses, CI checks, or benchmark runners and need a stable record shape for tasks, runs, scores, failures, and normalized external results. It does not run evaluations, call model providers, store dashboards, or orchestrate agents. It gives those tools a shared contract.
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install agent-eval-contract
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
For local development from this repo:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
uv sync --dev
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Validate A Record
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from agent_eval_contract import validate_eval_run
|
|
48
|
+
|
|
49
|
+
run = validate_eval_run(
|
|
50
|
+
{
|
|
51
|
+
"run_id": "run-login-flow-001",
|
|
52
|
+
"task_id": "task-login-flow-001",
|
|
53
|
+
"harness": "pytest",
|
|
54
|
+
"model": "gpt-5",
|
|
55
|
+
"mode": "autonomous",
|
|
56
|
+
"context_profile": "repo_only",
|
|
57
|
+
"final_status": "success",
|
|
58
|
+
"checks": ["pytest tests/test_auth_redirect.py -q"],
|
|
59
|
+
}
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
print(run.model_dump(mode="json"))
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Validation returns typed Pydantic model instances. Invalid records raise `pydantic.ValidationError` with structured field errors.
|
|
66
|
+
|
|
67
|
+
## CLI
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
agent-eval-contract validate --kind run --file examples/eval_run.json
|
|
71
|
+
agent-eval-contract schemas --output-dir /tmp/agent-eval-contract-schemas
|
|
72
|
+
agent-eval-contract fixtures --output-dir /tmp/agent-eval-contract-fixtures
|
|
73
|
+
agent-eval-contract normalize --harness terminal-bench --file examples/terminal_bench_result.json --task-id task-login-flow-001 --model gpt-5
|
|
74
|
+
agent-eval-contract normalize --harness swe-bench --file examples/swe_bench_result.json
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
The legacy `agent-eval-contract-fixtures` command is kept for one more release and still writes fixture bundles; prefer `agent-eval-contract fixtures`.
|
|
78
|
+
|
|
79
|
+
## What It Provides
|
|
80
|
+
|
|
81
|
+
- Pydantic models for eval tasks, runs, scores, failures, external results, normalized runs, and fixture manifests
|
|
82
|
+
- runtime validators that return typed model instances
|
|
83
|
+
- JSON Schema export for all public models
|
|
84
|
+
- bundled sample records and markdown templates
|
|
85
|
+
- Terminal-Bench and SWE-bench oriented normalization helpers
|
|
86
|
+
- a small CLI for validation, schema export, fixture generation, and normalization
|
|
87
|
+
|
|
88
|
+
## Contract Vocabulary
|
|
89
|
+
|
|
90
|
+
The public core uses generic vocabulary only. Project-specific concepts should live in `metadata` or a separate adapter package.
|
|
91
|
+
|
|
92
|
+
- `context_profile`: `repo_only`, `provided_context`, `clean_room`, `tool_augmented`, `full_workspace`
|
|
93
|
+
- `source`: `manual`, `ci`, `benchmark`, `production_trace`, `synthetic`
|
|
94
|
+
- `mode`: `interactive`, `autonomous`, `shadow`, `replay`, `benchmark`
|
|
95
|
+
- `final_status`: `success`, `partial`, `failed`, `abandoned`, `error`
|
|
96
|
+
|
|
97
|
+
See [docs/contract.md](docs/contract.md), [docs/field-reference.md](docs/field-reference.md), and [docs/adapters.md](docs/adapters.md) for the model contract and adapter guidance.
|
|
98
|
+
|
|
99
|
+
## Development
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
uv run ruff check agent_eval_contract tests
|
|
103
|
+
uv run ruff format --check agent_eval_contract tests
|
|
104
|
+
uv run basedpyright agent_eval_contract tests
|
|
105
|
+
uv run pytest -q
|
|
106
|
+
uv build --out-dir /tmp/agent-eval-contract-dist
|
|
107
|
+
```
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Agent Eval Contract
|
|
2
|
+
|
|
3
|
+
Pydantic contracts and JSON Schemas for portable agent evaluation records.
|
|
4
|
+
|
|
5
|
+
Use this package when you are experimenting with agents, harnesses, CI checks, or benchmark runners and need a stable record shape for tasks, runs, scores, failures, and normalized external results. It does not run evaluations, call model providers, store dashboards, or orchestrate agents. It gives those tools a shared contract.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install agent-eval-contract
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
For local development from this repo:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
uv sync --dev
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Validate A Record
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from agent_eval_contract import validate_eval_run
|
|
23
|
+
|
|
24
|
+
run = validate_eval_run(
|
|
25
|
+
{
|
|
26
|
+
"run_id": "run-login-flow-001",
|
|
27
|
+
"task_id": "task-login-flow-001",
|
|
28
|
+
"harness": "pytest",
|
|
29
|
+
"model": "gpt-5",
|
|
30
|
+
"mode": "autonomous",
|
|
31
|
+
"context_profile": "repo_only",
|
|
32
|
+
"final_status": "success",
|
|
33
|
+
"checks": ["pytest tests/test_auth_redirect.py -q"],
|
|
34
|
+
}
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
print(run.model_dump(mode="json"))
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Validation returns typed Pydantic model instances. Invalid records raise `pydantic.ValidationError` with structured field errors.
|
|
41
|
+
|
|
42
|
+
## CLI
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
agent-eval-contract validate --kind run --file examples/eval_run.json
|
|
46
|
+
agent-eval-contract schemas --output-dir /tmp/agent-eval-contract-schemas
|
|
47
|
+
agent-eval-contract fixtures --output-dir /tmp/agent-eval-contract-fixtures
|
|
48
|
+
agent-eval-contract normalize --harness terminal-bench --file examples/terminal_bench_result.json --task-id task-login-flow-001 --model gpt-5
|
|
49
|
+
agent-eval-contract normalize --harness swe-bench --file examples/swe_bench_result.json
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
The legacy `agent-eval-contract-fixtures` command is kept for one more release and still writes fixture bundles; prefer `agent-eval-contract fixtures`.
|
|
53
|
+
|
|
54
|
+
## What It Provides
|
|
55
|
+
|
|
56
|
+
- Pydantic models for eval tasks, runs, scores, failures, external results, normalized runs, and fixture manifests
|
|
57
|
+
- runtime validators that return typed model instances
|
|
58
|
+
- JSON Schema export for all public models
|
|
59
|
+
- bundled sample records and markdown templates
|
|
60
|
+
- Terminal-Bench and SWE-bench oriented normalization helpers
|
|
61
|
+
- a small CLI for validation, schema export, fixture generation, and normalization
|
|
62
|
+
|
|
63
|
+
## Contract Vocabulary
|
|
64
|
+
|
|
65
|
+
The public core uses generic vocabulary only. Project-specific concepts should live in `metadata` or a separate adapter package.
|
|
66
|
+
|
|
67
|
+
- `context_profile`: `repo_only`, `provided_context`, `clean_room`, `tool_augmented`, `full_workspace`
|
|
68
|
+
- `source`: `manual`, `ci`, `benchmark`, `production_trace`, `synthetic`
|
|
69
|
+
- `mode`: `interactive`, `autonomous`, `shadow`, `replay`, `benchmark`
|
|
70
|
+
- `final_status`: `success`, `partial`, `failed`, `abandoned`, `error`
|
|
71
|
+
|
|
72
|
+
See [docs/contract.md](docs/contract.md), [docs/field-reference.md](docs/field-reference.md), and [docs/adapters.md](docs/adapters.md) for the model contract and adapter guidance.
|
|
73
|
+
|
|
74
|
+
## Development
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
uv run ruff check agent_eval_contract tests
|
|
78
|
+
uv run ruff format --check agent_eval_contract tests
|
|
79
|
+
uv run basedpyright agent_eval_contract tests
|
|
80
|
+
uv run pytest -q
|
|
81
|
+
uv build --out-dir /tmp/agent-eval-contract-dist
|
|
82
|
+
```
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Release
|
|
2
|
+
|
|
3
|
+
## Current Version
|
|
4
|
+
|
|
5
|
+
- Version: `0.2.0`
|
|
6
|
+
- Contract version: `0.1`
|
|
7
|
+
- Status: public package release
|
|
8
|
+
- Source package: `agent_eval_contract`
|
|
9
|
+
|
|
10
|
+
## Public Promise
|
|
11
|
+
|
|
12
|
+
`agent-eval-contract` defines, validates, serializes, exports JSON Schema for, and normalizes portable agent evaluation records.
|
|
13
|
+
|
|
14
|
+
## Release Checks
|
|
15
|
+
|
|
16
|
+
CI runs the same quality ladder on pull requests and pushes. Run this local block before tagging or publishing:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
uv run ruff check agent_eval_contract tests
|
|
20
|
+
uv run ruff format --check agent_eval_contract tests
|
|
21
|
+
uv run basedpyright agent_eval_contract tests
|
|
22
|
+
uv run pytest -q
|
|
23
|
+
uv build --out-dir /tmp/agent-eval-contract-dist
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Then install the wheel in a temp virtualenv and smoke test:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
python -m venv /tmp/agent-eval-contract-venv
|
|
30
|
+
/tmp/agent-eval-contract-venv/bin/pip install /tmp/agent-eval-contract-dist/agent_eval_contract-0.2.0-py3-none-any.whl
|
|
31
|
+
/tmp/agent-eval-contract-venv/bin/agent-eval-contract validate --kind run --file examples/eval_run.json
|
|
32
|
+
/tmp/agent-eval-contract-venv/bin/agent-eval-contract schemas --output-dir /tmp/agent-eval-contract-schemas
|
|
33
|
+
/tmp/agent-eval-contract-venv/bin/agent-eval-contract normalize --harness swe-bench --file examples/swe_bench_result.json
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Boundaries
|
|
37
|
+
|
|
38
|
+
Public core:
|
|
39
|
+
|
|
40
|
+
- Pydantic record models
|
|
41
|
+
- runtime validation helpers
|
|
42
|
+
- JSON Schema export
|
|
43
|
+
- external harness normalization
|
|
44
|
+
- fixture bundle generation
|
|
45
|
+
- CLI validation and schema export
|
|
46
|
+
|
|
47
|
+
Out of scope:
|
|
48
|
+
|
|
49
|
+
- evaluation execution
|
|
50
|
+
- model provider calls
|
|
51
|
+
- dashboard storage
|
|
52
|
+
- private workflow vocabulary
|
|
53
|
+
- agent orchestration
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .clean_room import run_clean_room_contract_check
|
|
4
|
+
from .external import (
|
|
5
|
+
normalize_external_result,
|
|
6
|
+
normalize_swe_bench_result,
|
|
7
|
+
normalize_terminal_bench_result,
|
|
8
|
+
to_swe_bench_format,
|
|
9
|
+
to_terminal_bench_format,
|
|
10
|
+
)
|
|
11
|
+
from .models import (
|
|
12
|
+
CONTEXT_PROFILES,
|
|
13
|
+
EVAL_RUN_MODES,
|
|
14
|
+
EVAL_TASK_SOURCES,
|
|
15
|
+
EXTERNAL_HARNESSES,
|
|
16
|
+
FAILURE_PRIORITIES,
|
|
17
|
+
FINAL_STATUSES,
|
|
18
|
+
ContextProfile,
|
|
19
|
+
EvalFailure,
|
|
20
|
+
EvalRun,
|
|
21
|
+
EvalRunMode,
|
|
22
|
+
EvalScore,
|
|
23
|
+
EvalTask,
|
|
24
|
+
EvalTaskSource,
|
|
25
|
+
ExternalHarness,
|
|
26
|
+
ExternalResult,
|
|
27
|
+
FailurePriority,
|
|
28
|
+
FinalStatus,
|
|
29
|
+
FixtureBundleManifest,
|
|
30
|
+
JsonValue,
|
|
31
|
+
NormalizedRun,
|
|
32
|
+
)
|
|
33
|
+
from .release import load_release_metadata, validate_release_metadata
|
|
34
|
+
from .samples import load_sample, validate_all_samples, validate_sample
|
|
35
|
+
from .schema_export import export_json_schemas
|
|
36
|
+
from .templates import (
|
|
37
|
+
render_eval_template,
|
|
38
|
+
supported_template_ids,
|
|
39
|
+
validate_eval_template,
|
|
40
|
+
validate_eval_template_file,
|
|
41
|
+
validate_template_directory,
|
|
42
|
+
)
|
|
43
|
+
from .validators import (
|
|
44
|
+
HARNESS_DIMENSION_NAMES,
|
|
45
|
+
validate_context_profile,
|
|
46
|
+
validate_eval_failure,
|
|
47
|
+
validate_eval_run,
|
|
48
|
+
validate_eval_score,
|
|
49
|
+
validate_eval_task,
|
|
50
|
+
validate_external_result,
|
|
51
|
+
validate_final_status,
|
|
52
|
+
validate_harness_fixture_components,
|
|
53
|
+
validate_priority,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Public API of the package: every name listed here is imported above from one
# of the package's submodules and re-exported for ``from agent_eval_contract
# import ...`` users.
__all__ = [
    "CONTEXT_PROFILES",
    "EVAL_RUN_MODES",
    "EVAL_TASK_SOURCES",
    "EXTERNAL_HARNESSES",
    "FAILURE_PRIORITIES",
    "FINAL_STATUSES",
    "HARNESS_DIMENSION_NAMES",
    "ContextProfile",
    "EvalFailure",
    "EvalRun",
    "EvalRunMode",
    "EvalScore",
    "EvalTask",
    "EvalTaskSource",
    "ExternalHarness",
    "ExternalResult",
    "FailurePriority",
    "FinalStatus",
    "FixtureBundleManifest",
    "JsonValue",
    "NormalizedRun",
    "export_json_schemas",
    "load_release_metadata",
    "load_sample",
    "normalize_external_result",
    "normalize_swe_bench_result",
    "normalize_terminal_bench_result",
    "render_eval_template",
    "run_clean_room_contract_check",
    "supported_template_ids",
    "to_swe_bench_format",
    "to_terminal_bench_format",
    "validate_all_samples",
    "validate_context_profile",
    "validate_eval_failure",
    "validate_eval_run",
    "validate_eval_score",
    "validate_eval_task",
    "validate_eval_template",
    "validate_eval_template_file",
    "validate_external_result",
    "validate_final_status",
    "validate_harness_fixture_components",
    "validate_priority",
    "validate_release_metadata",
    "validate_sample",
    "validate_template_directory",
]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from .samples import validate_all_samples
|
|
7
|
+
from .templates import validate_template_directory
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def run_clean_room_contract_check(
    *,
    template_root: Path,
    sample_root: Path | None = None,
) -> dict[str, Any]:
    """Validate a template directory and the sample records, then summarize.

    Args:
        template_root: Directory handed to ``validate_template_directory``.
        sample_root: Optional directory forwarded to ``validate_all_samples``.
            When omitted, ``validate_all_samples`` is called with no
            arguments and uses its own default.

    Returns:
        A report with the keys ``ok`` (always ``True``), ``template_count``,
        ``sample_count``, ``templates`` and ``samples``. The last two hold
        the validators' return values unchanged.

    NOTE(review): validation failures presumably surface as exceptions from
    the two validators, since ``ok`` is never set to ``False`` here -- confirm
    against ``templates.py`` and ``samples.py``.
    """
    validated_templates = validate_template_directory(template_root)

    # Only forward ``sample_root`` when the caller actually supplied one.
    if sample_root is not None:
        validated_samples = validate_all_samples(sample_root=sample_root)
    else:
        validated_samples = validate_all_samples()

    report: dict[str, Any] = {"ok": True}
    report["template_count"] = len(validated_templates)
    report["sample_count"] = len(validated_samples)
    report["templates"] = validated_templates
    report["samples"] = validated_samples
    return report
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from .external import normalize_external_result
|
|
11
|
+
from .fixture_runner import write_contract_fixture_bundle
|
|
12
|
+
from .schema_export import export_json_schemas
|
|
13
|
+
from .validators import (
|
|
14
|
+
validate_eval_failure,
|
|
15
|
+
validate_eval_run,
|
|
16
|
+
validate_eval_score,
|
|
17
|
+
validate_eval_task,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _load_json(path: Path) -> dict[str, Any]:
    """Read ``path`` as UTF-8 text and return its top-level JSON object.

    The path is user-expanded and resolved before reading.

    Raises:
        ValueError: If the decoded top-level value is not a JSON object.
            ``json.loads`` itself raises ``json.JSONDecodeError`` (a
            ``ValueError`` subclass) for malformed JSON.
    """
    resolved = path.expanduser().resolve()
    document = json.loads(resolved.read_text(encoding="utf-8"))
    if isinstance(document, dict):
        return document
    # The message reports the path as the caller passed it, not the resolved one.
    raise ValueError(f"{path} must contain a JSON object")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _print_json(value: BaseModel | dict[str, Any] | list[str]) -> None:
    """Print ``value`` to stdout as JSON, indented by two spaces with sorted keys.

    Pydantic models are first converted with ``model_dump(mode="json")``;
    dicts and lists are serialized as given.
    """
    if isinstance(value, BaseModel):
        payload = value.model_dump(mode="json")
    else:
        payload = value
    print(json.dumps(payload, indent=2, sort_keys=True))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _run_fixtures(args: argparse.Namespace) -> int:
    """Handle the ``fixtures`` subcommand.

    Calls ``write_contract_fixture_bundle`` with ``args.output_dir`` as a
    ``Path``, prints the value it returns as JSON, and returns exit code 0.
    """
    output_dir = Path(args.output_dir)
    bundle = write_contract_fixture_bundle(output_dir)
    _print_json(bundle)
    return 0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _run_schemas(args: argparse.Namespace) -> int:
    """Handle the ``schemas`` subcommand.

    Calls ``export_json_schemas`` with ``args.output_dir`` as a ``Path``,
    prints its result under a ``"schemas"`` key as JSON, and returns exit
    code 0.
    """
    exported = export_json_schemas(Path(args.output_dir))
    _print_json({"schemas": exported})
    return 0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _run_validate(args: argparse.Namespace) -> int:
    """Handle the ``validate`` subcommand.

    Loads the JSON object at ``args.file``, runs the validator selected by
    ``args.kind`` on it, prints the validator's result as JSON, and returns
    exit code 0. Exceptions raised while loading or validating propagate to
    the caller.
    """
    record = _load_json(Path(args.file))

    # Record kind (as accepted by ``--kind``) -> validator for that kind.
    dispatch = {
        "task": validate_eval_task,
        "run": validate_eval_run,
        "score": validate_eval_score,
        "failure": validate_eval_failure,
    }
    validate = dispatch[args.kind]
    _print_json(validate(record))
    return 0
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _run_normalize(args: argparse.Namespace) -> int:
    """Handle the ``normalize`` subcommand.

    Loads the JSON object at ``args.file`` and passes it to
    ``normalize_external_result`` together with ``args.task_id`` (as
    ``eval_task_id``), ``args.harness`` and ``args.model``. The result is
    printed as JSON and exit code 0 is returned.
    """
    raw_result = _load_json(Path(args.file))
    overrides = {
        "eval_task_id": args.task_id,
        "harness": args.harness,
        "model": args.model,
    }
    _print_json(normalize_external_result(raw_result, **overrides))
    return 0
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``agent-eval-contract`` argument parser.

    The parser requires one of four subcommands, stored in ``command``:
    ``fixtures``, ``schemas``, ``validate`` and ``normalize``. Each subparser
    stores its handler in ``func`` via ``set_defaults`` so that ``main`` can
    dispatch with ``args.func(args)``.

    Returns:
        The configured top-level ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        prog="agent-eval-contract",
        description="Validate, normalize, and export portable agent evaluation contracts.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    fixtures = subparsers.add_parser(
        "fixtures", help="Write sample records, templates, and schemas."
    )
    fixtures.add_argument(
        "--output-dir", required=True, help="Directory to write fixture artifacts into."
    )
    fixtures.set_defaults(func=_run_fixtures)

    schemas = subparsers.add_parser(
        "schemas", help="Export JSON Schemas for public contract models."
    )
    schemas.add_argument(
        "--output-dir", required=True, help="Directory to write schema files into."
    )
    schemas.set_defaults(func=_run_schemas)

    validate = subparsers.add_parser(
        "validate", help="Validate a JSON record against a contract model."
    )
    # --kind and --harness previously had no help text, unlike every other
    # option, which left blank entries in the subcommands' --help output.
    validate.add_argument(
        "--kind",
        choices=("task", "run", "score", "failure"),
        required=True,
        help="Contract model to validate the record against.",
    )
    validate.add_argument("--file", required=True, help="JSON file to validate.")
    validate.set_defaults(func=_run_validate)

    normalize = subparsers.add_parser(
        "normalize", help="Normalize external harness output."
    )
    normalize.add_argument(
        "--harness",
        choices=("terminal-bench", "swe-bench"),
        required=True,
        help="External harness format of the result file.",
    )
    normalize.add_argument(
        "--file", required=True, help="External result JSON file to normalize."
    )
    normalize.add_argument(
        "--task-id", help="Task id to use when the external result omits one."
    )
    normalize.add_argument(
        "--model", help="Model name to use when the external result omits one."
    )
    normalize.set_defaults(func=_run_normalize)

    return parser
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and run the selected subcommand.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` makes argparse
            read ``sys.argv[1:]``.

    Returns:
        The handler's return value converted to ``int``, for use as the
        process exit status.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return int(handler(namespace))
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|