agent-eval-contract 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_eval_contract/__init__.py +104 -0
- agent_eval_contract/clean_room.py +27 -0
- agent_eval_contract/cli.py +114 -0
- agent_eval_contract/external.py +215 -0
- agent_eval_contract/fixture_runner.py +82 -0
- agent_eval_contract/models.py +157 -0
- agent_eval_contract/py.typed +1 -0
- agent_eval_contract/release.py +52 -0
- agent_eval_contract/release_metadata.json +39 -0
- agent_eval_contract/samples/eval_failure.json +22 -0
- agent_eval_contract/samples/eval_run.json +25 -0
- agent_eval_contract/samples/eval_score.json +16 -0
- agent_eval_contract/samples/eval_task.json +23 -0
- agent_eval_contract/samples/external_result_normalization.json +39 -0
- agent_eval_contract/samples.py +81 -0
- agent_eval_contract/schema_export.py +42 -0
- agent_eval_contract/schemas.py +47 -0
- agent_eval_contract/templates.py +101 -0
- agent_eval_contract/validators.py +116 -0
- agent_eval_contract-0.2.0.dist-info/METADATA +107 -0
- agent_eval_contract-0.2.0.dist-info/RECORD +25 -0
- agent_eval_contract-0.2.0.dist-info/WHEEL +5 -0
- agent_eval_contract-0.2.0.dist-info/entry_points.txt +3 -0
- agent_eval_contract-0.2.0.dist-info/licenses/LICENSE +21 -0
- agent_eval_contract-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .clean_room import run_clean_room_contract_check
|
|
4
|
+
from .external import (
|
|
5
|
+
normalize_external_result,
|
|
6
|
+
normalize_swe_bench_result,
|
|
7
|
+
normalize_terminal_bench_result,
|
|
8
|
+
to_swe_bench_format,
|
|
9
|
+
to_terminal_bench_format,
|
|
10
|
+
)
|
|
11
|
+
from .models import (
|
|
12
|
+
CONTEXT_PROFILES,
|
|
13
|
+
EVAL_RUN_MODES,
|
|
14
|
+
EVAL_TASK_SOURCES,
|
|
15
|
+
EXTERNAL_HARNESSES,
|
|
16
|
+
FAILURE_PRIORITIES,
|
|
17
|
+
FINAL_STATUSES,
|
|
18
|
+
ContextProfile,
|
|
19
|
+
EvalFailure,
|
|
20
|
+
EvalRun,
|
|
21
|
+
EvalRunMode,
|
|
22
|
+
EvalScore,
|
|
23
|
+
EvalTask,
|
|
24
|
+
EvalTaskSource,
|
|
25
|
+
ExternalHarness,
|
|
26
|
+
ExternalResult,
|
|
27
|
+
FailurePriority,
|
|
28
|
+
FinalStatus,
|
|
29
|
+
FixtureBundleManifest,
|
|
30
|
+
JsonValue,
|
|
31
|
+
NormalizedRun,
|
|
32
|
+
)
|
|
33
|
+
from .release import load_release_metadata, validate_release_metadata
|
|
34
|
+
from .samples import load_sample, validate_all_samples, validate_sample
|
|
35
|
+
from .schema_export import export_json_schemas
|
|
36
|
+
from .templates import (
|
|
37
|
+
render_eval_template,
|
|
38
|
+
supported_template_ids,
|
|
39
|
+
validate_eval_template,
|
|
40
|
+
validate_eval_template_file,
|
|
41
|
+
validate_template_directory,
|
|
42
|
+
)
|
|
43
|
+
from .validators import (
|
|
44
|
+
HARNESS_DIMENSION_NAMES,
|
|
45
|
+
validate_context_profile,
|
|
46
|
+
validate_eval_failure,
|
|
47
|
+
validate_eval_run,
|
|
48
|
+
validate_eval_score,
|
|
49
|
+
validate_eval_task,
|
|
50
|
+
validate_external_result,
|
|
51
|
+
validate_final_status,
|
|
52
|
+
validate_harness_fixture_components,
|
|
53
|
+
validate_priority,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Public API of the package: every name re-exported by the imports above.
__all__ = [
    # Closed vocabularies (frozensets of allowed string values).
    "CONTEXT_PROFILES",
    "EVAL_RUN_MODES",
    "EVAL_TASK_SOURCES",
    "EXTERNAL_HARNESSES",
    "FAILURE_PRIORITIES",
    "FINAL_STATUSES",
    "HARNESS_DIMENSION_NAMES",
    # Type aliases and contract models.
    "ContextProfile",
    "EvalFailure",
    "EvalRun",
    "EvalRunMode",
    "EvalScore",
    "EvalTask",
    "EvalTaskSource",
    "ExternalHarness",
    "ExternalResult",
    "FailurePriority",
    "FinalStatus",
    "FixtureBundleManifest",
    "JsonValue",
    "NormalizedRun",
    # Functions.
    "export_json_schemas",
    "load_release_metadata",
    "load_sample",
    "normalize_external_result",
    "normalize_swe_bench_result",
    "normalize_terminal_bench_result",
    "render_eval_template",
    "run_clean_room_contract_check",
    "supported_template_ids",
    "to_swe_bench_format",
    "to_terminal_bench_format",
    "validate_all_samples",
    "validate_context_profile",
    "validate_eval_failure",
    "validate_eval_run",
    "validate_eval_score",
    "validate_eval_task",
    "validate_eval_template",
    "validate_eval_template_file",
    "validate_external_result",
    "validate_final_status",
    "validate_harness_fixture_components",
    "validate_priority",
    "validate_release_metadata",
    "validate_sample",
    "validate_template_directory",
]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from .samples import validate_all_samples
|
|
7
|
+
from .templates import validate_template_directory
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def run_clean_room_contract_check(
    *,
    template_root: Path,
    sample_root: Path | None = None,
) -> dict[str, Any]:
    """Validate the templates under *template_root* and the contract samples.

    Args:
        template_root: Directory handed to ``validate_template_directory``.
        sample_root: Optional directory forwarded to ``validate_all_samples``.
            When ``None`` that function is called with no arguments, so
            whatever default it defines applies.

    Returns:
        A report dict with ``ok`` (always ``True`` when this function
        returns), the two validator results under ``templates`` and
        ``samples``, and their lengths under ``template_count`` and
        ``sample_count``.
    """
    # Templates are validated first; an exception from either validator
    # propagates unchanged to the caller.
    checked_templates = validate_template_directory(template_root)
    if sample_root is None:
        checked_samples = validate_all_samples()
    else:
        checked_samples = validate_all_samples(sample_root=sample_root)

    report: dict[str, Any] = {"ok": True}
    report["template_count"] = len(checked_templates)
    report["sample_count"] = len(checked_samples)
    report["templates"] = checked_templates
    report["samples"] = checked_samples
    return report
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from .external import normalize_external_result
|
|
11
|
+
from .fixture_runner import write_contract_fixture_bundle
|
|
12
|
+
from .schema_export import export_json_schemas
|
|
13
|
+
from .validators import (
|
|
14
|
+
validate_eval_failure,
|
|
15
|
+
validate_eval_run,
|
|
16
|
+
validate_eval_score,
|
|
17
|
+
validate_eval_task,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 JSON and return the decoded top-level object.

    The path is user-expanded and resolved before reading.

    Raises:
        ValueError: If the document's top-level value is not a JSON object.
    """
    text = path.expanduser().resolve().read_text(encoding="utf-8")
    document = json.loads(text)
    if isinstance(document, dict):
        return document
    raise ValueError(f"{path} must contain a JSON object")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _print_json(value: BaseModel | dict[str, Any] | list[str]) -> None:
    """Print *value* to stdout as indented JSON with sorted keys.

    Pydantic models are first converted with ``model_dump(mode="json")``;
    dicts and lists are serialized as they are.
    """
    if isinstance(value, BaseModel):
        payload = value.model_dump(mode="json")
    else:
        payload = value
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    print(rendered)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _run_fixtures(args: argparse.Namespace) -> int:
    """Handle ``fixtures``: write the bundle into ``args.output_dir`` and print its manifest.

    Returns:
        ``0`` on completion; errors surface as exceptions.
    """
    destination = Path(args.output_dir)
    manifest = write_contract_fixture_bundle(destination)
    _print_json(manifest)
    return 0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _run_schemas(args: argparse.Namespace) -> int:
    """Handle ``schemas``: export JSON Schemas into ``args.output_dir``.

    The value returned by ``export_json_schemas`` is printed under the
    ``"schemas"`` key.

    Returns:
        ``0`` on completion; errors surface as exceptions.
    """
    exported = export_json_schemas(Path(args.output_dir))
    _print_json({"schemas": exported})
    return 0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _run_validate(args: argparse.Namespace) -> int:
    """Handle ``validate``: check the JSON file ``args.file`` against one contract model.

    ``args.kind`` selects the validator (``task``, ``run``, ``score`` or
    ``failure``); the validator's result is printed as JSON.

    Returns:
        ``0`` on completion; load or validation errors surface as exceptions.
    """
    record = _load_json(Path(args.file))
    validator_by_kind = {
        "task": validate_eval_task,
        "run": validate_eval_run,
        "score": validate_eval_score,
        "failure": validate_eval_failure,
    }
    validate = validator_by_kind[args.kind]
    _print_json(validate(record))
    return 0
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _run_normalize(args: argparse.Namespace) -> int:
    """Handle ``normalize``: convert an external harness result file into a normalized run.

    ``args.task_id`` and ``args.model`` are passed through as the optional
    overrides of ``normalize_external_result``; the result is printed as JSON.

    Returns:
        ``0`` on completion; load or normalization errors surface as exceptions.
    """
    external = _load_json(Path(args.file))
    overrides = {
        "eval_task_id": args.task_id,
        "harness": args.harness,
        "model": args.model,
    }
    _print_json(normalize_external_result(external, **overrides))
    return 0
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``agent-eval-contract`` command-line parser.

    Four sub-commands are registered (``fixtures``, ``schemas``,
    ``validate``, ``normalize``). Each stores its handler under the
    ``func`` attribute of the parsed namespace, and a sub-command is
    mandatory.
    """
    root = argparse.ArgumentParser(
        prog="agent-eval-contract",
        description="Validate, normalize, and export portable agent evaluation contracts.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # fixtures: write the full sample/template/schema bundle.
    fixtures_cmd = commands.add_parser(
        "fixtures",
        help="Write sample records, templates, and schemas.",
    )
    fixtures_cmd.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write fixture artifacts into.",
    )
    fixtures_cmd.set_defaults(func=_run_fixtures)

    # schemas: export JSON Schemas only.
    schemas_cmd = commands.add_parser(
        "schemas",
        help="Export JSON Schemas for public contract models.",
    )
    schemas_cmd.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write schema files into.",
    )
    schemas_cmd.set_defaults(func=_run_schemas)

    # validate: check one JSON record against the chosen model.
    validate_cmd = commands.add_parser(
        "validate",
        help="Validate a JSON record against a contract model.",
    )
    validate_cmd.add_argument(
        "--kind",
        choices=("task", "run", "score", "failure"),
        required=True,
    )
    validate_cmd.add_argument("--file", required=True, help="JSON file to validate.")
    validate_cmd.set_defaults(func=_run_validate)

    # normalize: convert external harness output into a normalized run.
    normalize_cmd = commands.add_parser(
        "normalize",
        help="Normalize external harness output.",
    )
    normalize_cmd.add_argument(
        "--harness",
        choices=("terminal-bench", "swe-bench"),
        required=True,
    )
    normalize_cmd.add_argument(
        "--file",
        required=True,
        help="External result JSON file to normalize.",
    )
    normalize_cmd.add_argument(
        "--task-id",
        help="Task id to use when the external result omits one.",
    )
    normalize_cmd.add_argument(
        "--model",
        help="Model name to use when the external result omits one.",
    )
    normalize_cmd.set_defaults(func=_run_normalize)

    return root
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the selected sub-command.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The handler's result coerced to ``int``, for use as an exit code.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return int(handler(namespace))
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Script entry point: run the CLI and use main()'s return value as the exit status.
if __name__ == "__main__":
|
|
114
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Mapping
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from .models import ExternalHarness, FinalStatus, JsonValue, NormalizedRun
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _as_mapping(value: object) -> Mapping[str, Any]:
    """Return *value* unchanged when it is a mapping.

    Raises:
        TypeError: If *value* is not a ``Mapping``.
    """
    if not isinstance(value, Mapping):
        raise TypeError("external result must be a mapping")
    return value
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _string_list(value: object) -> list[str]:
    """Convert a list or tuple into a list of ``str(item)`` values.

    Any other input, including a bare string or ``None``, yields ``[]``.
    """
    if not isinstance(value, (list, tuple)):
        return []
    return [str(entry) for entry in value]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _json_metadata(data: Mapping[str, Any]) -> dict[str, JsonValue]:
    """Copy the JSON-compatible entries of *data* into a new dict.

    An entry is kept only when its key is a string and ``_is_json_value``
    accepts its value; all other entries are silently dropped.
    """
    kept: dict[str, JsonValue] = {}
    for key, value in data.items():
        if isinstance(key, str) and _is_json_value(value):
            kept[str(key)] = value
    return kept
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _is_json_value(value: object) -> bool:
    """Report whether *value* is built only from JSON-style Python types.

    Accepted: ``None``, ``str``, ``int``, ``float``, ``bool``, lists of
    accepted values, and dicts with string keys and accepted values.
    Tuples, sets and arbitrary objects are rejected.
    """
    if value is None:
        return True
    if isinstance(value, (str, int, float, bool)):
        return True
    if isinstance(value, list):
        for element in value:
            if not _is_json_value(element):
                return False
        return True
    if isinstance(value, dict):
        for key, element in value.items():
            if not isinstance(key, str):
                return False
            if not _is_json_value(element):
                return False
        return True
    return False
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _passed_status(data: Mapping[str, Any]) -> FinalStatus:
    """Derive a final status from an external harness result.

    The first of ``passed``, ``success`` and ``resolved`` whose value is not
    ``None`` decides the outcome by truthiness. A key that is present but
    ``None`` (as in a dumped model with unset optional fields) is treated
    like a missing key instead of hiding the later keys.

    When none of those flags carries a value, a string ``status`` field is
    matched case-insensitively against known pass/fail/error words.

    Returns:
        ``"success"``, ``"failed"`` or ``"error"`` when the outcome can be
        determined, otherwise ``"partial"``.
    """
    passed_value = None
    for flag in ("passed", "success", "resolved"):
        candidate = data.get(flag)
        if candidate is not None:
            passed_value = candidate
            break

    if passed_value is not None:
        return "success" if bool(passed_value) else "failed"

    status = data.get("status")
    if isinstance(status, str):
        lowered = status.lower()
        if lowered in {"passed", "pass", "success", "resolved"}:
            return "success"
        if lowered in {"failed", "fail", "failure", "unresolved"}:
            return "failed"
        if lowered in {"error", "errored"}:
            return "error"
    # No usable flag and no recognised status word: outcome is unknown.
    return "partial"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _score(data: Mapping[str, Any]) -> float | None:
    """Read ``data["score"]`` and return it on a 0.0-1.0 scale.

    Values in ``[0, 1]`` are returned as-is; values in ``(1, 100]`` are
    divided by 100. A missing, non-numeric, NaN or out-of-range score
    yields ``None``.
    """
    raw = data.get("score")
    if raw is None:
        return None
    try:
        numeric = float(raw)
    except (TypeError, ValueError):
        return None
    # Written as a negated range test so NaN, which fails every
    # comparison, is rejected here.
    if not (0.0 <= numeric <= 100.0):
        return None
    return numeric if numeric <= 1.0 else numeric / 100.0
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _duration_ms(data: Mapping[str, Any]) -> int | None:
    """Extract a run duration in whole milliseconds from an external result.

    ``duration_ms`` wins when it holds a value. Otherwise the first of
    ``duration_seconds`` / ``elapsed_seconds`` that is not ``None`` is
    converted from seconds.

    Returns:
        The duration as an ``int``, or ``None`` when no duration is present
        or the value cannot be converted (non-numeric, NaN or infinite).
    """
    value = data.get("duration_ms")
    if value is not None:
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers float("inf"); ValueError covers NaN and bad strings.
            return None

    seconds = data.get("duration_seconds")
    if seconds is None:
        # A present-but-None duration_seconds must not hide elapsed_seconds.
        seconds = data.get("elapsed_seconds")
    if seconds is None:
        return None
    try:
        # round(), not int(): 1.001 * 1000 is 1000.9999999999999 in binary
        # floating point and must become 1001 ms, not 1000.
        return round(float(seconds) * 1000)
    except (TypeError, ValueError, OverflowError):
        return None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _terminal_checks(data: Mapping[str, Any]) -> list[str]:
    """List the checks recorded in a Terminal-Bench style result.

    Uses ``tests_run`` (or ``checks`` when that key is absent). Only when
    neither supplies a non-``None`` value does it fall back to a one-item
    list holding the ``command`` string, or ``[]`` if there is no command.
    """
    listed = data.get("tests_run", data.get("checks"))
    if listed is None:
        command = data.get("command")
        if command:
            return [str(command)]
        return []
    return _string_list(listed)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _swe_bench_checks(data: Mapping[str, Any]) -> list[str]:
    """List the checks recorded in a SWE-bench style result.

    A non-empty ``tests_run`` / ``checks`` list is returned directly.
    Otherwise the fail-to-pass tests followed by the pass-to-pass tests are
    returned, reading the upper-case key first and the lower-case key when
    the upper-case one is absent.
    """
    explicit = _string_list(data.get("tests_run", data.get("checks", [])))
    if explicit:
        return explicit

    combined: list[str] = []
    for upper_key, lower_key in (
        ("FAIL_TO_PASS", "fail_to_pass"),
        ("PASS_TO_PASS", "pass_to_pass"),
    ):
        combined.extend(_string_list(data.get(upper_key, data.get(lower_key, []))))
    return combined
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def to_swe_bench_format(eval_task: Mapping[str, Any]) -> dict[str, Any]:
    """Re-key an eval task mapping into SWE-bench style field names.

    Missing source fields become ``None``, except the two test lists,
    which default to ``[]``.

    Raises:
        TypeError: If *eval_task* is not a mapping.
    """
    source = _as_mapping(eval_task)
    # (output key, source key) pairs, in output order.
    renamed = (
        ("repo", "repo"),
        ("instance_id", "task_id"),
        ("problem_statement", "description"),
        ("base_commit", "start_revision"),
    )
    exported: dict[str, Any] = {target: source.get(origin) for target, origin in renamed}
    exported["FAIL_TO_PASS"] = source.get("fail_to_pass", [])
    exported["PASS_TO_PASS"] = source.get("pass_to_pass", [])
    return exported
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def to_terminal_bench_format(eval_task: Mapping[str, Any]) -> dict[str, Any]:
    """Re-key an eval task mapping into Terminal-Bench style fields.

    Defaults applied when a key is absent: ``command`` is ``"pytest"``,
    ``expected_exit_code`` is ``0``, ``setup_commands`` is ``[]`` and
    ``timeout_seconds`` is ``600``. The exit code and timeout are coerced
    with ``int``.

    Raises:
        TypeError: If *eval_task* is not a mapping.
    """
    source = _as_mapping(eval_task)
    exit_code = int(source.get("expected_exit_code", 0))
    timeout = int(source.get("timeout_seconds", 600))
    return {
        "task_id": source.get("task_id"),
        "command": source.get("command", "pytest"),
        "expected_exit_code": exit_code,
        "setup_commands": source.get("setup_commands", []),
        "timeout_seconds": timeout,
    }
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def normalize_external_result(
    external_result: Mapping[str, Any],
    *,
    eval_task_id: str | None = None,
    harness: ExternalHarness | str,
    model: str | None = None,
) -> NormalizedRun:
    """Normalize a result from any external harness into a ``NormalizedRun``.

    ``"terminal-bench"`` and ``"swe-bench"`` are delegated to their
    dedicated normalizers. Any other harness name goes through a generic
    path that reads ``task_id`` / ``instance_id``, ``model``, and
    ``tests_run`` / ``checks``.

    Args:
        external_result: Raw result mapping produced by the harness.
        eval_task_id: Task id that overrides any id found in the result.
        harness: Harness name; stored as ``str(harness)`` on the generic path.
        model: Model name that overrides any name found in the result.

    Raises:
        TypeError: If *external_result* is not a mapping.
        ValueError: If no task id is given and none is found in the result.
    """
    if harness == "terminal-bench":
        return normalize_terminal_bench_result(
            external_result, eval_task_id=eval_task_id, model=model
        )
    if harness == "swe-bench":
        return normalize_swe_bench_result(
            external_result, eval_task_id=eval_task_id, model=model
        )

    # Generic path for harnesses without a dedicated normalizer.
    record = _as_mapping(external_result)
    resolved_task_id = eval_task_id or record.get("task_id") or record.get("instance_id")
    if not resolved_task_id:
        raise ValueError("eval_task_id is required when the external result has no task_id")
    model_name = model or record.get("model") or "unknown"
    raw_checks = record.get("tests_run", record.get("checks", []))
    return NormalizedRun(
        task_id=str(resolved_task_id),
        harness=str(harness),
        model=str(model_name),
        final_status=_passed_status(record),
        checks=_string_list(raw_checks),
        duration_ms=_duration_ms(record),
        score=_score(record),
        metadata={"source": "external_result", "raw": _json_metadata(record)},
    )
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def normalize_terminal_bench_result(
    external_result: Mapping[str, Any],
    *,
    eval_task_id: str | None = None,
    model: str | None = None,
) -> NormalizedRun:
    """Normalize a Terminal-Bench result mapping into a ``NormalizedRun``.

    Args:
        external_result: Raw Terminal-Bench result.
        eval_task_id: Task id that overrides the result's ``task_id``.
        model: Model name that overrides the result's ``model``; when both
            are missing the model is recorded as ``"unknown"``.

    Raises:
        TypeError: If *external_result* is not a mapping.
        ValueError: If no task id is given and the result has no ``task_id``.
    """
    record = _as_mapping(external_result)
    resolved_task_id = eval_task_id or record.get("task_id")
    if not resolved_task_id:
        raise ValueError("eval_task_id is required when the Terminal-Bench result has no task_id")
    model_name = model or record.get("model") or "unknown"
    return NormalizedRun(
        task_id=str(resolved_task_id),
        harness="terminal-bench",
        model=str(model_name),
        final_status=_passed_status(record),
        checks=_terminal_checks(record),
        duration_ms=_duration_ms(record),
        score=_score(record),
        metadata={"source": "terminal-bench", "raw": _json_metadata(record)},
    )
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def normalize_swe_bench_result(
    external_result: Mapping[str, Any],
    *,
    eval_task_id: str | None = None,
    model: str | None = None,
) -> NormalizedRun:
    """Normalize a SWE-bench result mapping into a ``NormalizedRun``.

    Args:
        external_result: Raw SWE-bench result.
        eval_task_id: Task id that overrides the result's ``instance_id``
            (or ``task_id``, which is consulted second).
        model: Model name that overrides ``model_name_or_path`` / ``model``
            from the result; ``"unknown"`` is used when none is available.

    Raises:
        TypeError: If *external_result* is not a mapping.
        ValueError: If no task id is given and the result carries none.
    """
    record = _as_mapping(external_result)
    resolved_task_id = eval_task_id or record.get("instance_id") or record.get("task_id")
    if not resolved_task_id:
        raise ValueError("eval_task_id is required when the SWE-bench result has no instance_id")
    model_name = model or record.get("model_name_or_path") or record.get("model") or "unknown"
    return NormalizedRun(
        task_id=str(resolved_task_id),
        harness="swe-bench",
        model=str(model_name),
        final_status=_passed_status(record),
        checks=_swe_bench_checks(record),
        duration_ms=_duration_ms(record),
        score=_score(record),
        metadata={"source": "swe-bench", "raw": _json_metadata(record)},
    )
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from .clean_room import run_clean_room_contract_check
|
|
9
|
+
from .models import FixtureBundleManifest
|
|
10
|
+
from .release import load_release_metadata
|
|
11
|
+
from .samples import SAMPLE_FILES, SAMPLE_ROOT, load_sample
|
|
12
|
+
from .schema_export import export_json_schemas
|
|
13
|
+
from .templates import render_eval_template, supported_template_ids
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def write_contract_fixture_bundle(
    output_dir: Path,
    *,
    sample_root: Path = SAMPLE_ROOT,
) -> dict[str, Any]:
    """Write samples, templates, schemas and a manifest under *output_dir*.

    Layout produced (directories are created as needed):
    ``samples/<file>.json``, ``templates/<template_id>.md``, ``schemas/``
    (filled by ``export_json_schemas``) and ``manifest.json``.

    Args:
        output_dir: Bundle root; user-expanded and resolved before use.
        sample_root: Directory the source samples are loaded from.

    Returns:
        The manifest as a JSON-ready dict, identical to what is written to
        ``manifest.json``.
    """
    bundle_root = output_dir.expanduser().resolve()
    samples_dir = bundle_root / "samples"
    templates_dir = bundle_root / "templates"
    schemas_dir = bundle_root / "schemas"
    for directory in (samples_dir, templates_dir, schemas_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Re-serialize every known sample in sorted id order.
    written_samples: list[str] = []
    for sample_id, filename in sorted(SAMPLE_FILES.items()):
        payload = load_sample(sample_id, sample_root=sample_root)
        serialized = json.dumps(payload, indent=2, sort_keys=True) + "\n"
        (samples_dir / filename).write_text(serialized, encoding="utf-8")
        written_samples.append(sample_id)

    # Render each supported template to <template_id>.md.
    written_templates: list[str] = []
    for template_id in supported_template_ids():
        destination = templates_dir / f"{template_id}.md"
        destination.write_text(render_eval_template(template_id), encoding="utf-8")
        written_templates.append(template_id)

    # The clean-room check runs against the copies just written, not the sources.
    clean_room_report = run_clean_room_contract_check(
        template_root=templates_dir,
        sample_root=samples_dir,
    )
    release = load_release_metadata()
    schema_files = export_json_schemas(schemas_dir)

    manifest_model = FixtureBundleManifest(
        version=str(release["version"]),
        contract_version=str(release["contract_version"]),
        samples=written_samples,
        templates=written_templates,
        schemas=schema_files,
        metadata={
            "clean_room_check": clean_room_report,
            "public_surfaces": release["public_surfaces"],
            "out_of_scope": release["out_of_scope"],
        },
    )
    manifest = manifest_model.model_dump(mode="json")
    manifest_text = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    (bundle_root / "manifest.json").write_text(manifest_text, encoding="utf-8")
    return manifest
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: write a fixture bundle and print its manifest.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.
            ``--output-dir`` is required.

    Returns:
        ``0`` on completion; errors surface as exceptions.
    """
    cli = argparse.ArgumentParser(
        description="Produce an agent-eval-contract fixture bundle with samples, templates, and JSON Schemas."
    )
    cli.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write fixture artifacts into.",
    )
    options = cli.parse_args(argv)
    bundle_manifest = write_contract_fixture_bundle(Path(options.output_dir))
    rendered = json.dumps(bundle_manifest, indent=2, sort_keys=True)
    print(rendered)
    return 0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# Script entry point: build the fixture bundle and use main()'s return value as the exit status.
if __name__ == "__main__":
|
|
82
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
from typing import Annotated, Literal
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_validator
|
|
7
|
+
|
|
8
|
+
# Closed string vocabularies used by the contract models. Each Literal alias
# is for static typing; the matching frozenset below lists the same values
# for runtime membership checks. Keep the two in sync when adding a value.
ContextProfile = Literal[
    "repo_only", "provided_context", "clean_room", "tool_augmented", "full_workspace"
]
EvalTaskSource = Literal["manual", "ci", "benchmark", "production_trace", "synthetic"]
EvalRunMode = Literal["interactive", "autonomous", "shadow", "replay", "benchmark"]
FinalStatus = Literal[
    "success",
    "partial",
    "failed",
    "abandoned",
    "error",
]
FailurePriority = Literal[
    "low",
    "medium",
    "high",
    "critical",
]
ExternalHarness = Literal[
    "terminal-bench",
    "swe-bench",
]

CONTEXT_PROFILES: frozenset[str] = frozenset(
    ("repo_only", "provided_context", "clean_room", "tool_augmented", "full_workspace")
)
EVAL_TASK_SOURCES: frozenset[str] = frozenset(
    ("manual", "ci", "benchmark", "production_trace", "synthetic")
)
EVAL_RUN_MODES: frozenset[str] = frozenset(
    ("interactive", "autonomous", "shadow", "replay", "benchmark")
)
FINAL_STATUSES: frozenset[str] = frozenset(
    ("success", "partial", "failed", "abandoned", "error")
)
FAILURE_PRIORITIES: frozenset[str] = frozenset(("low", "medium", "high", "critical"))
EXTERNAL_HARNESSES: frozenset[str] = frozenset(("terminal-bench", "swe-bench"))
|
|
45
|
+
|
|
46
|
+
# Reusable constrained numeric types for the models below.
# Score: float restricted to the closed interval [0.0, 1.0].
Score = Annotated[float, Field(ge=0.0, le=1.0)]
|
|
47
|
+
# NonNegativeInt: int that must be >= 0.
NonNegativeInt = Annotated[int, Field(ge=0)]
|
|
48
|
+
# NonNegativeFloat: float that must be >= 0.0.
NonNegativeFloat = Annotated[float, Field(ge=0.0)]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def utc_now() -> datetime:
|
|
52
|
+
return datetime.now(UTC)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Shared base for every contract model in this module.
# extra="forbid" makes validation reject unknown fields; populate_by_name=True
# lets a field be populated by its attribute name as well as by any alias.
class ContractModel(BaseModel):
|
|
56
|
+
model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# An evaluation task definition.
# task_id, title and description are required non-empty strings. source and
# context_profile are restricted to their Literal vocabularies and default to
# "manual" and "repo_only". List and dict fields default to fresh empty
# containers.
class EvalTask(ContractModel):
|
|
60
|
+
task_id: str = Field(min_length=1)
|
|
61
|
+
title: str = Field(min_length=1)
|
|
62
|
+
description: str = Field(min_length=1)
|
|
63
|
+
source: EvalTaskSource = "manual"
|
|
64
|
+
context_profile: ContextProfile = "repo_only"
|
|
65
|
+
acceptance_criteria: list[str] = Field(default_factory=list)
|
|
66
|
+
repo: str | None = None
|
|
67
|
+
start_revision: str | None = None
|
|
68
|
+
tags: list[str] = Field(default_factory=list)
|
|
69
|
+
# Defaults to the current UTC time (utc_now) when the model is instantiated.
created_at: datetime = Field(default_factory=utc_now)
|
|
70
|
+
metadata: dict[str, JsonValue] = Field(default_factory=dict)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# One execution of a task by a harness/model pair.
# run_id, task_id, harness and model are required non-empty strings, and
# final_status is required with no default. Counters and measurements must be
# non-negative. No cross-field validation (for example started_at versus
# completed_at) is defined in this class.
class EvalRun(ContractModel):
|
|
74
|
+
run_id: str = Field(min_length=1)
|
|
75
|
+
task_id: str = Field(min_length=1)
|
|
76
|
+
# Free-form harness name; not restricted to the ExternalHarness vocabulary.
harness: str = Field(min_length=1)
|
|
77
|
+
model: str = Field(min_length=1)
|
|
78
|
+
mode: EvalRunMode = "interactive"
|
|
79
|
+
context_profile: ContextProfile = "repo_only"
|
|
80
|
+
final_status: FinalStatus
|
|
81
|
+
started_at: datetime | None = None
|
|
82
|
+
completed_at: datetime | None = None
|
|
83
|
+
duration_ms: NonNegativeInt | None = None
|
|
84
|
+
total_tokens: NonNegativeInt | None = None
|
|
85
|
+
estimated_cost_usd: NonNegativeFloat | None = None
|
|
86
|
+
tool_calls: NonNegativeInt = 0
|
|
87
|
+
failed_steps: NonNegativeInt = 0
|
|
88
|
+
files_changed: NonNegativeInt = 0
|
|
89
|
+
checks: list[str] = Field(default_factory=list)
|
|
90
|
+
output_summary: str | None = None
|
|
91
|
+
metadata: dict[str, JsonValue] = Field(default_factory=dict)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# Scoring record attached to a run via run_id.
# overall_score and every value in metrics must lie in [0.0, 1.0] (the Score
# type). passed is optional and is not derived from the scores here.
class EvalScore(ContractModel):
|
|
95
|
+
run_id: str = Field(min_length=1)
|
|
96
|
+
overall_score: Score
|
|
97
|
+
metrics: dict[str, Score] = Field(default_factory=dict)
|
|
98
|
+
passed: bool | None = None
|
|
99
|
+
reviewer_notes: str | None = None
|
|
100
|
+
metadata: dict[str, JsonValue] = Field(default_factory=dict)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# Failure analysis record for a run.
# failure_id, run_id and summary are required non-empty strings.
# priority is restricted to FailurePriority and defaults to "medium".
class EvalFailure(ContractModel):
|
|
104
|
+
failure_id: str = Field(min_length=1)
|
|
105
|
+
run_id: str = Field(min_length=1)
|
|
106
|
+
# min_length=1 on a list: at least one failure type must be supplied.
failure_types: list[str] = Field(min_length=1)
|
|
107
|
+
summary: str = Field(min_length=1)
|
|
108
|
+
suspected_cause: str | None = None
|
|
109
|
+
affected_components: list[str] = Field(default_factory=list)
|
|
110
|
+
recommended_fixes: list[str] = Field(default_factory=list)
|
|
111
|
+
priority: FailurePriority = "medium"
|
|
112
|
+
regression_task_id: str | None = None
|
|
113
|
+
metadata: dict[str, JsonValue] = Field(default_factory=dict)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# Result as reported by an external harness, before normalization.
# Only harness is required. Every outcome field is optional, so passed and
# success may both be None on an instance.
class ExternalResult(ContractModel):
|
|
117
|
+
# Free-form harness name; not restricted to the ExternalHarness vocabulary.
harness: str = Field(min_length=1)
|
|
118
|
+
task_id: str | None = None
|
|
119
|
+
model: str | None = None
|
|
120
|
+
passed: bool | None = None
|
|
121
|
+
success: bool | None = None
|
|
122
|
+
score: Score | None = None
|
|
123
|
+
tests_run: list[str] = Field(default_factory=list)
|
|
124
|
+
duration_ms: NonNegativeInt | None = None
|
|
125
|
+
raw: dict[str, JsonValue] = Field(default_factory=dict)
|
|
126
|
+
metadata: dict[str, JsonValue] = Field(default_factory=dict)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# Harness-independent view of an external result.
# task_id, harness, model and final_status are required. mode defaults to
# "benchmark" and context_profile to "clean_room". duration_ms must be
# non-negative and score must lie in [0.0, 1.0] when given.
class NormalizedRun(ContractModel):
|
|
130
|
+
task_id: str = Field(min_length=1)
|
|
131
|
+
harness: str = Field(min_length=1)
|
|
132
|
+
model: str = Field(min_length=1)
|
|
133
|
+
mode: EvalRunMode = "benchmark"
|
|
134
|
+
context_profile: ContextProfile = "clean_room"
|
|
135
|
+
final_status: FinalStatus
|
|
136
|
+
checks: list[str] = Field(default_factory=list)
|
|
137
|
+
duration_ms: NonNegativeInt | None = None
|
|
138
|
+
score: Score | None = None
|
|
139
|
+
metadata: dict[str, JsonValue] = Field(default_factory=dict)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# Manifest describing a generated fixture bundle: package identity, versions,
# generation time and the lists of sample, template and schema entries.
# version and contract_version are required non-empty strings.
class FixtureBundleManifest(ContractModel):
|
|
143
|
+
package: str = "agent-eval-contract"
|
|
144
|
+
version: str = Field(min_length=1)
|
|
145
|
+
contract_version: str = Field(min_length=1)
|
|
146
|
+
# Defaults to the current UTC time (utc_now) when the model is instantiated.
generated_at: datetime = Field(default_factory=utc_now)
|
|
147
|
+
samples: list[str] = Field(default_factory=list)
|
|
148
|
+
templates: list[str] = Field(default_factory=list)
|
|
149
|
+
schemas: list[str] = Field(default_factory=list)
|
|
150
|
+
metadata: dict[str, JsonValue] = Field(default_factory=dict)
|
|
151
|
+
|
|
152
|
+
# Rejects any explicitly supplied package value other than the fixed name.
@field_validator("package")
|
|
153
|
+
@classmethod
|
|
154
|
+
def package_name_matches(_cls, value: str) -> str:
|
|
155
|
+
if value != "agent-eval-contract":
|
|
156
|
+
raise ValueError("package must be agent-eval-contract")
|
|
157
|
+
return value
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|