agent-eval-contract 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
1
+ from __future__ import annotations
2
+
3
+ from .clean_room import run_clean_room_contract_check
4
+ from .external import (
5
+ normalize_external_result,
6
+ normalize_swe_bench_result,
7
+ normalize_terminal_bench_result,
8
+ to_swe_bench_format,
9
+ to_terminal_bench_format,
10
+ )
11
+ from .models import (
12
+ CONTEXT_PROFILES,
13
+ EVAL_RUN_MODES,
14
+ EVAL_TASK_SOURCES,
15
+ EXTERNAL_HARNESSES,
16
+ FAILURE_PRIORITIES,
17
+ FINAL_STATUSES,
18
+ ContextProfile,
19
+ EvalFailure,
20
+ EvalRun,
21
+ EvalRunMode,
22
+ EvalScore,
23
+ EvalTask,
24
+ EvalTaskSource,
25
+ ExternalHarness,
26
+ ExternalResult,
27
+ FailurePriority,
28
+ FinalStatus,
29
+ FixtureBundleManifest,
30
+ JsonValue,
31
+ NormalizedRun,
32
+ )
33
+ from .release import load_release_metadata, validate_release_metadata
34
+ from .samples import load_sample, validate_all_samples, validate_sample
35
+ from .schema_export import export_json_schemas
36
+ from .templates import (
37
+ render_eval_template,
38
+ supported_template_ids,
39
+ validate_eval_template,
40
+ validate_eval_template_file,
41
+ validate_template_directory,
42
+ )
43
+ from .validators import (
44
+ HARNESS_DIMENSION_NAMES,
45
+ validate_context_profile,
46
+ validate_eval_failure,
47
+ validate_eval_run,
48
+ validate_eval_score,
49
+ validate_eval_task,
50
+ validate_external_result,
51
+ validate_final_status,
52
+ validate_harness_fixture_components,
53
+ validate_priority,
54
+ )
55
+
56
+ __all__ = [
57
+ "CONTEXT_PROFILES",
58
+ "EVAL_RUN_MODES",
59
+ "EVAL_TASK_SOURCES",
60
+ "EXTERNAL_HARNESSES",
61
+ "FAILURE_PRIORITIES",
62
+ "FINAL_STATUSES",
63
+ "HARNESS_DIMENSION_NAMES",
64
+ "ContextProfile",
65
+ "EvalFailure",
66
+ "EvalRun",
67
+ "EvalRunMode",
68
+ "EvalScore",
69
+ "EvalTask",
70
+ "EvalTaskSource",
71
+ "ExternalHarness",
72
+ "ExternalResult",
73
+ "FailurePriority",
74
+ "FinalStatus",
75
+ "FixtureBundleManifest",
76
+ "JsonValue",
77
+ "NormalizedRun",
78
+ "export_json_schemas",
79
+ "load_release_metadata",
80
+ "load_sample",
81
+ "normalize_external_result",
82
+ "normalize_swe_bench_result",
83
+ "normalize_terminal_bench_result",
84
+ "render_eval_template",
85
+ "run_clean_room_contract_check",
86
+ "supported_template_ids",
87
+ "to_swe_bench_format",
88
+ "to_terminal_bench_format",
89
+ "validate_all_samples",
90
+ "validate_context_profile",
91
+ "validate_eval_failure",
92
+ "validate_eval_run",
93
+ "validate_eval_score",
94
+ "validate_eval_task",
95
+ "validate_eval_template",
96
+ "validate_eval_template_file",
97
+ "validate_external_result",
98
+ "validate_final_status",
99
+ "validate_harness_fixture_components",
100
+ "validate_priority",
101
+ "validate_release_metadata",
102
+ "validate_sample",
103
+ "validate_template_directory",
104
+ ]
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from .samples import validate_all_samples
7
+ from .templates import validate_template_directory
8
+
9
+
10
def run_clean_room_contract_check(
    *,
    template_root: Path,
    sample_root: Path | None = None,
) -> dict[str, Any]:
    """Validate a template directory and the sample records, then summarize.

    Args:
        template_root: Directory handed to ``validate_template_directory``.
        sample_root: Optional directory handed to ``validate_all_samples``.
            When ``None`` the validator is called without arguments, so it
            uses whatever default location it defines.

    Returns:
        A dict with ``ok`` (always ``True`` when this function returns),
        the two validator results under ``templates`` / ``samples``, and
        their lengths under ``template_count`` / ``sample_count``.
    """
    validated_templates = validate_template_directory(template_root)
    if sample_root is None:
        validated_samples = validate_all_samples()
    else:
        validated_samples = validate_all_samples(sample_root=sample_root)

    # NOTE(review): "ok" is unconditional, so failures presumably surface as
    # exceptions raised by the validators above -- confirm against them.
    report: dict[str, Any] = {"ok": True}
    report["template_count"] = len(validated_templates)
    report["sample_count"] = len(validated_samples)
    report["templates"] = validated_templates
    report["samples"] = validated_samples
    return report
@@ -0,0 +1,114 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel
9
+
10
+ from .external import normalize_external_result
11
+ from .fixture_runner import write_contract_fixture_bundle
12
+ from .schema_export import export_json_schemas
13
+ from .validators import (
14
+ validate_eval_failure,
15
+ validate_eval_run,
16
+ validate_eval_score,
17
+ validate_eval_task,
18
+ )
19
+
20
+
21
def _load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 JSON and return its top-level object.

    The path is user-expanded and resolved before reading.

    Raises:
        ValueError: If the decoded document is not a JSON object (dict).
    """
    source = path.expanduser().resolve()
    document = json.loads(source.read_text(encoding="utf-8"))
    if isinstance(document, dict):
        return document
    raise ValueError(f"{path} must contain a JSON object")
26
+
27
+
28
def _print_json(value: BaseModel | dict[str, Any] | list[str]) -> None:
    """Print *value* to stdout as 2-space-indented JSON with sorted keys.

    Pydantic models are first converted with ``model_dump(mode="json")``;
    any other value is handed to ``json.dumps`` unchanged.
    """
    if isinstance(value, BaseModel):
        serializable = value.model_dump(mode="json")
    else:
        serializable = value
    rendered = json.dumps(serializable, indent=2, sort_keys=True)
    print(rendered)
31
+
32
+
33
def _run_fixtures(args: argparse.Namespace) -> int:
    """Handle the ``fixtures`` subcommand.

    Writes the fixture bundle into ``args.output_dir``, prints the returned
    manifest as JSON, and returns exit code 0.
    """
    target_dir = Path(args.output_dir)
    manifest = write_contract_fixture_bundle(target_dir)
    _print_json(manifest)
    return 0
36
+
37
+
38
def _run_schemas(args: argparse.Namespace) -> int:
    """Handle the ``schemas`` subcommand.

    Exports the JSON Schemas into ``args.output_dir``, prints the export
    result under a ``"schemas"`` key, and returns exit code 0.
    """
    target_dir = Path(args.output_dir)
    exported = export_json_schemas(target_dir)
    _print_json({"schemas": exported})
    return 0
41
+
42
+
43
def _run_validate(args: argparse.Namespace) -> int:
    """Handle the ``validate`` subcommand.

    Loads ``args.file`` as a JSON object, runs the validator selected by
    ``args.kind`` on it, prints the validator's result as JSON, and returns
    exit code 0.
    """
    record = _load_json(Path(args.file))
    # Maps the CLI's --kind value to the matching contract validator.
    dispatch = {
        "task": validate_eval_task,
        "run": validate_eval_run,
        "score": validate_eval_score,
        "failure": validate_eval_failure,
    }
    validator = dispatch[args.kind]
    _print_json(validator(record))
    return 0
53
+
54
+
55
def _run_normalize(args: argparse.Namespace) -> int:
    """Handle the ``normalize`` subcommand.

    Loads ``args.file`` as a JSON object, normalizes it for the harness
    named by ``args.harness`` (passing ``args.task_id`` and ``args.model``
    through as overrides), prints the result as JSON, and returns 0.
    """
    payload = _load_json(Path(args.file))
    _print_json(
        normalize_external_result(
            payload,
            eval_task_id=args.task_id,
            harness=args.harness,
            model=args.model,
        )
    )
    return 0
65
+
66
+
67
def build_parser() -> argparse.ArgumentParser:
    """Build the ``agent-eval-contract`` command-line parser.

    Registers four required subcommands (``fixtures``, ``schemas``,
    ``validate``, ``normalize``). Each one stores its handler function in
    the ``func`` default so the caller can dispatch on ``args.func``.
    """
    root = argparse.ArgumentParser(
        prog="agent-eval-contract",
        description="Validate, normalize, and export portable agent evaluation contracts.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # fixtures: write the full sample/template/schema bundle.
    fixtures_cmd = commands.add_parser(
        "fixtures",
        help="Write sample records, templates, and schemas.",
    )
    fixtures_cmd.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write fixture artifacts into.",
    )
    fixtures_cmd.set_defaults(func=_run_fixtures)

    # schemas: export JSON Schemas only.
    schemas_cmd = commands.add_parser(
        "schemas",
        help="Export JSON Schemas for public contract models.",
    )
    schemas_cmd.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write schema files into.",
    )
    schemas_cmd.set_defaults(func=_run_schemas)

    # validate: check one JSON record against one contract model.
    validate_cmd = commands.add_parser(
        "validate",
        help="Validate a JSON record against a contract model.",
    )
    validate_cmd.add_argument(
        "--kind",
        choices=("task", "run", "score", "failure"),
        required=True,
    )
    validate_cmd.add_argument("--file", required=True, help="JSON file to validate.")
    validate_cmd.set_defaults(func=_run_validate)

    # normalize: convert external harness output into the contract shape.
    normalize_cmd = commands.add_parser(
        "normalize",
        help="Normalize external harness output.",
    )
    normalize_cmd.add_argument(
        "--harness",
        choices=("terminal-bench", "swe-bench"),
        required=True,
    )
    normalize_cmd.add_argument(
        "--file",
        required=True,
        help="External result JSON file to normalize.",
    )
    normalize_cmd.add_argument(
        "--task-id",
        help="Task id to use when the external result omits one.",
    )
    normalize_cmd.add_argument(
        "--model",
        help="Model name to use when the external result omits one.",
    )
    normalize_cmd.set_defaults(func=_run_normalize)

    return root
105
+
106
+
107
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the selected subcommand, returning its exit code.

    With ``argv=None`` argparse reads the process arguments.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return int(handler(namespace))


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,215 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Mapping
4
+ from typing import Any
5
+
6
+ from .models import ExternalHarness, FinalStatus, JsonValue, NormalizedRun
7
+
8
+
9
def _as_mapping(value: object) -> Mapping[str, Any]:
    """Return *value* unchanged if it is a mapping.

    Raises:
        TypeError: If *value* is not a ``collections.abc.Mapping``.
    """
    if not isinstance(value, Mapping):
        raise TypeError("external result must be a mapping")
    return value
13
+
14
+
15
def _string_list(value: object) -> list[str]:
    """Return the items of a list or tuple converted with ``str``.

    Any other input (including ``None``, strings, and sets) yields ``[]``.
    """
    if not isinstance(value, (list, tuple)):
        return []
    return list(map(str, value))
21
+
22
+
23
def _json_metadata(data: Mapping[str, Any]) -> dict[str, JsonValue]:
    """Copy the JSON-compatible entries of *data* into a new dict.

    Entries are dropped when the key is not a ``str`` or when the value
    fails ``_is_json_value``; nothing is raised for them.
    """
    kept: dict[str, JsonValue] = {}
    for key, item in data.items():
        if not isinstance(key, str):
            continue
        if _is_json_value(item):
            kept[str(key)] = item
    return kept
29
+
30
+
31
def _is_json_value(value: object) -> bool:
    """Report whether *value* is built only from JSON-style Python types.

    Accepted: ``None``, ``str``, ``int``, ``float``, ``bool``, lists of
    accepted values, and dicts with ``str`` keys and accepted values.
    Everything else (tuples, sets, arbitrary objects) is rejected.
    """
    if value is None:
        return True
    if isinstance(value, (str, int, float, bool)):
        return True
    if isinstance(value, list):
        for element in value:
            if not _is_json_value(element):
                return False
        return True
    if isinstance(value, dict):
        for key, element in value.items():
            if not (isinstance(key, str) and _is_json_value(element)):
                return False
        return True
    return False
39
+
40
+
41
def _passed_status(data: Mapping[str, Any]) -> FinalStatus:
    """Derive a final status from an external result's pass/fail fields.

    The boolean-style flags ``passed``, ``success`` and ``resolved`` are
    consulted in that order; the first one whose value is not ``None``
    decides between ``"success"`` (truthy) and ``"failed"`` (falsy).

    If none of them carries a value, the textual ``status`` field is
    matched case-insensitively (surrounding whitespace ignored) against
    known pass / fail / error spellings. Anything else, including a
    missing or non-string ``status``, yields ``"partial"``.
    """
    # A flag that is present but None must not hide a later flag. The
    # previous nested ``data.get(key, default)`` only fell back when the
    # key was missing, so {"passed": None, "success": True} was reported
    # as "partial".
    for flag_name in ("passed", "success", "resolved"):
        flag = data.get(flag_name)
        if flag is not None:
            return "success" if bool(flag) else "failed"

    status = data.get("status")
    if isinstance(status, str):
        label = status.strip().lower()
        if label in {"passed", "pass", "success", "resolved"}:
            return "success"
        if label in {"failed", "fail", "failure", "unresolved"}:
            return "failed"
        if label in {"error", "errored"}:
            return "error"
    return "partial"
55
+
56
+
57
def _score(data: Mapping[str, Any]) -> float | None:
    """Return the ``score`` field scaled into the range 0.0-1.0, or ``None``.

    Values already within 0-1 are returned as floats unchanged; values
    above 1 and up to 100 are divided by 100. Missing, non-numeric,
    negative, NaN, or greater-than-100 scores produce ``None``.
    """
    raw = data.get("score")
    if raw is None:
        return None
    try:
        as_float = float(raw)
    except (TypeError, ValueError):
        return None
    # The chained comparison is False for NaN, so NaN is rejected here too.
    if not 0.0 <= as_float <= 100.0:
        return None
    return as_float if as_float <= 1.0 else as_float / 100.0
70
+
71
+
72
def _duration_ms(data: Mapping[str, Any]) -> int | None:
    """Return the run duration in whole milliseconds, or ``None``.

    ``duration_ms`` is used when it has a value. Otherwise the first of
    ``duration_seconds`` / ``elapsed_seconds`` that has a value is
    multiplied by 1000 and truncated to an int.

    Values that cannot be converted (wrong type, non-numeric text, NaN,
    or infinity) produce ``None`` instead of raising.
    """
    millis = data.get("duration_ms")
    if millis is not None:
        try:
            return int(millis)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int() of an infinite float.
            return None

    # Check each key for an actual value: a key that is present but None
    # must not hide the other one.
    seconds = data.get("duration_seconds")
    if seconds is None:
        seconds = data.get("elapsed_seconds")
    if seconds is None:
        return None
    try:
        return int(float(seconds) * 1000)
    except (TypeError, ValueError, OverflowError):
        # ValueError covers NaN and bad text; OverflowError covers infinity.
        return None
86
+
87
+
88
def _terminal_checks(data: Mapping[str, Any]) -> list[str]:
    """Return the checks recorded in a Terminal-Bench style result.

    ``tests_run`` is read first, then ``checks`` if that key is absent.
    When the looked-up value is ``None`` the ``command`` field is used as a
    single-item list (or ``[]`` if it is empty or missing).
    """
    listed = data.get("tests_run", data.get("checks"))
    if listed is None:
        command = data.get("command")
        return [str(command)] if command else []
    return _string_list(listed)
94
+
95
+
96
def _swe_bench_checks(data: Mapping[str, Any]) -> list[str]:
    """Return the checks recorded in a SWE-bench style result.

    A non-empty ``tests_run`` / ``checks`` list wins. Otherwise the
    fail-to-pass list followed by the pass-to-pass list is returned, each
    read from its upper-case key first and its lower-case key second.
    """
    explicit = _string_list(data.get("tests_run", data.get("checks", [])))
    if explicit:
        return explicit

    combined: list[str] = []
    key_pairs = (
        ("FAIL_TO_PASS", "fail_to_pass"),
        ("PASS_TO_PASS", "pass_to_pass"),
    )
    for upper_key, lower_key in key_pairs:
        combined.extend(_string_list(data.get(upper_key, data.get(lower_key, []))))
    return combined
103
+
104
+
105
def to_swe_bench_format(eval_task: Mapping[str, Any]) -> dict[str, Any]:
    """Re-key an eval task mapping using SWE-bench style field names.

    Missing scalar fields become ``None``; missing test lists become ``[]``.

    Raises:
        TypeError: If *eval_task* is not a mapping.
    """
    source = _as_mapping(eval_task)
    # (output key, eval-task key) pairs for the plain pass-through fields.
    renames = (
        ("repo", "repo"),
        ("instance_id", "task_id"),
        ("problem_statement", "description"),
        ("base_commit", "start_revision"),
    )
    exported: dict[str, Any] = {
        target: source.get(origin) for target, origin in renames
    }
    exported["FAIL_TO_PASS"] = source.get("fail_to_pass", [])
    exported["PASS_TO_PASS"] = source.get("pass_to_pass", [])
    return exported
115
+
116
+
117
def to_terminal_bench_format(eval_task: Mapping[str, Any]) -> dict[str, Any]:
    """Project an eval task mapping onto Terminal-Bench style fields.

    Defaults applied when a key is absent: ``command`` -> ``"pytest"``,
    ``expected_exit_code`` -> ``0``, ``setup_commands`` -> ``[]``,
    ``timeout_seconds`` -> ``600``. The exit code and timeout are passed
    through ``int``.

    Raises:
        TypeError: If *eval_task* is not a mapping.
    """
    source = _as_mapping(eval_task)
    exported: dict[str, Any] = {
        "task_id": source.get("task_id"),
        "command": source.get("command", "pytest"),
    }
    exported["expected_exit_code"] = int(source.get("expected_exit_code", 0))
    exported["setup_commands"] = source.get("setup_commands", [])
    exported["timeout_seconds"] = int(source.get("timeout_seconds", 600))
    return exported
126
+
127
+
128
def normalize_external_result(
    external_result: Mapping[str, Any],
    *,
    eval_task_id: str | None = None,
    harness: ExternalHarness | str,
    model: str | None = None,
) -> NormalizedRun:
    """Normalize an external harness result into a ``NormalizedRun``.

    ``"terminal-bench"`` and ``"swe-bench"`` are delegated to their
    dedicated normalizers. Any other harness name takes a generic path that
    reads the task id from ``task_id`` / ``instance_id`` and the checks
    from ``tests_run`` / ``checks``.

    Args:
        external_result: Raw result mapping produced by the harness.
        eval_task_id: Task id that takes precedence over the result's own.
        harness: Harness name; stored on the run as a string.
        model: Model name that takes precedence over the result's own;
            falls back to ``"unknown"``.

    Raises:
        TypeError: If *external_result* is not a mapping.
        ValueError: If no task id can be determined.
    """
    specialised = (
        ("terminal-bench", normalize_terminal_bench_result),
        ("swe-bench", normalize_swe_bench_result),
    )
    for known_harness, normalizer in specialised:
        if harness == known_harness:
            return normalizer(
                external_result,
                eval_task_id=eval_task_id,
                model=model,
            )

    # Generic path for harnesses without a dedicated normalizer.
    data = _as_mapping(external_result)
    task_id = eval_task_id or data.get("task_id") or data.get("instance_id")
    if not task_id:
        raise ValueError("eval_task_id is required when the external result has no task_id")
    model_name = model or data.get("model") or "unknown"
    raw_checks = data.get("tests_run", data.get("checks", []))
    return NormalizedRun(
        task_id=str(task_id),
        harness=str(harness),
        model=str(model_name),
        final_status=_passed_status(data),
        checks=_string_list(raw_checks),
        duration_ms=_duration_ms(data),
        score=_score(data),
        metadata={"source": "external_result", "raw": _json_metadata(data)},
    )
166
+
167
+
168
def normalize_terminal_bench_result(
    external_result: Mapping[str, Any],
    *,
    eval_task_id: str | None = None,
    model: str | None = None,
) -> NormalizedRun:
    """Normalize a Terminal-Bench result into a ``NormalizedRun``.

    Args:
        external_result: Raw result mapping.
        eval_task_id: Task id that takes precedence over the result's
            ``task_id``.
        model: Model name that takes precedence over the result's
            ``model``; falls back to ``"unknown"``.

    Raises:
        TypeError: If *external_result* is not a mapping.
        ValueError: If no task id can be determined.
    """
    data = _as_mapping(external_result)
    task_id = eval_task_id or data.get("task_id")
    if not task_id:
        raise ValueError("eval_task_id is required when the Terminal-Bench result has no task_id")

    model_name = str(model or data.get("model") or "unknown")
    status = _passed_status(data)
    checks = _terminal_checks(data)
    duration = _duration_ms(data)
    score = _score(data)
    # Only the JSON-compatible part of the raw result is kept as metadata.
    provenance = {"source": "terminal-bench", "raw": _json_metadata(data)}
    return NormalizedRun(
        task_id=str(task_id),
        harness="terminal-bench",
        model=model_name,
        final_status=status,
        checks=checks,
        duration_ms=duration,
        score=score,
        metadata=provenance,
    )
191
+
192
+
193
def normalize_swe_bench_result(
    external_result: Mapping[str, Any],
    *,
    eval_task_id: str | None = None,
    model: str | None = None,
) -> NormalizedRun:
    """Normalize a SWE-bench result into a ``NormalizedRun``.

    Args:
        external_result: Raw result mapping.
        eval_task_id: Task id that takes precedence over the result's
            ``instance_id`` / ``task_id``.
        model: Model name that takes precedence over the result's
            ``model_name_or_path`` / ``model``; falls back to ``"unknown"``.

    Raises:
        TypeError: If *external_result* is not a mapping.
        ValueError: If no task id can be determined.
    """
    data = _as_mapping(external_result)
    task_id = eval_task_id or data.get("instance_id") or data.get("task_id")
    if not task_id:
        raise ValueError("eval_task_id is required when the SWE-bench result has no instance_id")

    model_name = str(
        model or data.get("model_name_or_path") or data.get("model") or "unknown"
    )
    status = _passed_status(data)
    checks = _swe_bench_checks(data)
    duration = _duration_ms(data)
    score = _score(data)
    # Only the JSON-compatible part of the raw result is kept as metadata.
    provenance = {"source": "swe-bench", "raw": _json_metadata(data)}
    return NormalizedRun(
        task_id=str(task_id),
        harness="swe-bench",
        model=model_name,
        final_status=status,
        checks=checks,
        duration_ms=duration,
        score=score,
        metadata=provenance,
    )
@@ -0,0 +1,82 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from .clean_room import run_clean_room_contract_check
9
+ from .models import FixtureBundleManifest
10
+ from .release import load_release_metadata
11
+ from .samples import SAMPLE_FILES, SAMPLE_ROOT, load_sample
12
+ from .schema_export import export_json_schemas
13
+ from .templates import render_eval_template, supported_template_ids
14
+
15
+
16
def write_contract_fixture_bundle(
    output_dir: Path,
    *,
    sample_root: Path = SAMPLE_ROOT,
) -> dict[str, Any]:
    """Write samples, templates, schemas and a manifest under *output_dir*.

    Layout produced (directories are created as needed):

    * ``samples/``: one JSON file per entry of ``SAMPLE_FILES``
    * ``templates/``: one ``<template_id>.md`` per supported template
    * ``schemas/``: whatever ``export_json_schemas`` writes
    * ``manifest.json``: the returned manifest

    The clean-room check is run against the sample and template files that
    were just written, and its result is embedded in the manifest metadata.

    Args:
        output_dir: Bundle root; user-expanded and resolved before use.
        sample_root: Directory the sample records are loaded from.

    Returns:
        The manifest as a JSON-mode dict (``model_dump(mode="json")``).
    """
    bundle_root = output_dir.expanduser().resolve()
    samples_dir = bundle_root / "samples"
    templates_dir = bundle_root / "templates"
    schemas_dir = bundle_root / "schemas"
    for directory in (samples_dir, templates_dir, schemas_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Samples are copied in sorted order so the manifest listing is stable.
    sample_ids: list[str] = []
    for sample_id, filename in sorted(SAMPLE_FILES.items()):
        record = load_sample(sample_id, sample_root=sample_root)
        serialized = json.dumps(record, indent=2, sort_keys=True) + "\n"
        (samples_dir / filename).write_text(serialized, encoding="utf-8")
        sample_ids.append(sample_id)

    template_ids: list[str] = []
    for template_id in supported_template_ids():
        destination = templates_dir / f"{template_id}.md"
        destination.write_text(render_eval_template(template_id), encoding="utf-8")
        template_ids.append(template_id)

    check = run_clean_room_contract_check(
        template_root=templates_dir,
        sample_root=samples_dir,
    )
    release = load_release_metadata()
    schema_files = export_json_schemas(schemas_dir)

    version = str(release["version"])
    contract_version = str(release["contract_version"])
    extra = {
        "clean_room_check": check,
        "public_surfaces": release["public_surfaces"],
        "out_of_scope": release["out_of_scope"],
    }
    manifest_model = FixtureBundleManifest(
        version=version,
        contract_version=contract_version,
        samples=sample_ids,
        templates=template_ids,
        schemas=schema_files,
        metadata=extra,
    )
    manifest = manifest_model.model_dump(mode="json")

    manifest_text = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    (bundle_root / "manifest.json").write_text(manifest_text, encoding="utf-8")
    return manifest
66
+
67
+
68
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: write a fixture bundle and print its manifest.

    With ``argv=None`` argparse reads the process arguments. Returns 0.
    """
    cli = argparse.ArgumentParser(
        description="Produce an agent-eval-contract fixture bundle with samples, templates, and JSON Schemas."
    )
    cli.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write fixture artifacts into.",
    )
    options = cli.parse_args(argv)
    bundle_manifest = write_contract_fixture_bundle(Path(options.output_dir))
    rendered = json.dumps(bundle_manifest, indent=2, sort_keys=True)
    print(rendered)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,157 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import UTC, datetime
4
+ from typing import Annotated, Literal
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_validator
7
+
8
# Closed vocabularies used as field types on the contract models.
ContextProfile = Literal[
    "repo_only", "provided_context", "clean_room", "tool_augmented", "full_workspace"
]
EvalTaskSource = Literal["manual", "ci", "benchmark", "production_trace", "synthetic"]
EvalRunMode = Literal["interactive", "autonomous", "shadow", "replay", "benchmark"]
FinalStatus = Literal["success", "partial", "failed", "abandoned", "error"]
FailurePriority = Literal["low", "medium", "high", "critical"]
ExternalHarness = Literal["terminal-bench", "swe-bench"]

# Runtime sets holding the same members as the Literal aliases above.
# They are written out by hand, so keep both lists in sync when editing.
CONTEXT_PROFILES: frozenset[str] = frozenset(
    {
        "repo_only",
        "provided_context",
        "clean_room",
        "tool_augmented",
        "full_workspace",
    }
)
EVAL_TASK_SOURCES: frozenset[str] = frozenset(
    {
        "manual",
        "ci",
        "benchmark",
        "production_trace",
        "synthetic",
    }
)
EVAL_RUN_MODES: frozenset[str] = frozenset(
    {
        "interactive",
        "autonomous",
        "shadow",
        "replay",
        "benchmark",
    }
)
FINAL_STATUSES: frozenset[str] = frozenset(
    {"success", "partial", "failed", "abandoned", "error"}
)
FAILURE_PRIORITIES: frozenset[str] = frozenset({"low", "medium", "high", "critical"})
EXTERNAL_HARNESSES: frozenset[str] = frozenset({"terminal-bench", "swe-bench"})

# Constrained scalar aliases shared by several models.
Score = Annotated[float, Field(ge=0.0, le=1.0)]  # inclusive range 0.0-1.0
NonNegativeInt = Annotated[int, Field(ge=0)]
NonNegativeFloat = Annotated[float, Field(ge=0.0)]
49
+
50
+
51
+ def utc_now() -> datetime:
52
+ return datetime.now(UTC)
53
+
54
+
55
class ContractModel(BaseModel):
    # Shared base for the contract models: unknown fields are rejected
    # (extra="forbid") and fields may be populated by name.
    # A "#" comment is used instead of a docstring because pydantic copies
    # model docstrings into the generated JSON Schema description.
    model_config = ConfigDict(
        extra="forbid",
        populate_by_name=True,
    )
57
+
58
+
59
class EvalTask(ContractModel):
    # A single evaluation task definition.

    # Required, non-empty identification and description.
    task_id: str = Field(min_length=1)
    title: str = Field(min_length=1)
    description: str = Field(min_length=1)

    # Classification; both default to the first listed vocabulary member.
    source: EvalTaskSource = "manual"
    context_profile: ContextProfile = "repo_only"

    acceptance_criteria: list[str] = Field(default_factory=list)

    # Optional repository coordinates.
    repo: str | None = None
    start_revision: str | None = None

    tags: list[str] = Field(default_factory=list)
    # Filled with the current UTC time when not supplied.
    created_at: datetime = Field(default_factory=utc_now)
    metadata: dict[str, JsonValue] = Field(default_factory=dict)
71
+
72
+
73
class EvalRun(ContractModel):
    # One execution of a task by a model under a harness.

    # Required, non-empty identifiers.
    run_id: str = Field(min_length=1)
    task_id: str = Field(min_length=1)
    harness: str = Field(min_length=1)
    model: str = Field(min_length=1)

    mode: EvalRunMode = "interactive"
    context_profile: ContextProfile = "repo_only"
    # The only field without a default besides the identifiers.
    final_status: FinalStatus

    # Optional timing and cost figures; numeric ones must be >= 0 when set.
    started_at: datetime | None = None
    completed_at: datetime | None = None
    duration_ms: NonNegativeInt | None = None
    total_tokens: NonNegativeInt | None = None
    estimated_cost_usd: NonNegativeFloat | None = None

    # Counters default to zero and must be >= 0.
    tool_calls: NonNegativeInt = 0
    failed_steps: NonNegativeInt = 0
    files_changed: NonNegativeInt = 0

    checks: list[str] = Field(default_factory=list)
    output_summary: str | None = None
    metadata: dict[str, JsonValue] = Field(default_factory=dict)
92
+
93
+
94
class EvalScore(ContractModel):
    # Scoring outcome attached to a run.

    run_id: str = Field(min_length=1)
    # Required; constrained to the inclusive range 0.0-1.0 by ``Score``.
    overall_score: Score
    # Named sub-scores, each under the same 0.0-1.0 constraint.
    metrics: dict[str, Score] = Field(default_factory=dict)
    passed: bool | None = None
    reviewer_notes: str | None = None
    metadata: dict[str, JsonValue] = Field(default_factory=dict)
101
+
102
+
103
class EvalFailure(ContractModel):
    # A failure record attached to a run.

    failure_id: str = Field(min_length=1)
    run_id: str = Field(min_length=1)
    # Required list with at least one entry (min_length applies to the list).
    failure_types: list[str] = Field(min_length=1)
    summary: str = Field(min_length=1)

    suspected_cause: str | None = None
    affected_components: list[str] = Field(default_factory=list)
    recommended_fixes: list[str] = Field(default_factory=list)
    priority: FailurePriority = "medium"
    regression_task_id: str | None = None
    metadata: dict[str, JsonValue] = Field(default_factory=dict)
114
+
115
+
116
class ExternalResult(ContractModel):
    # Result record as reported by an external harness; only ``harness``
    # is required, every other field is optional or defaults to empty.

    harness: str = Field(min_length=1)
    task_id: str | None = None
    model: str | None = None

    # Two alternative pass/fail flags; both are optional.
    passed: bool | None = None
    success: bool | None = None

    score: Score | None = None
    tests_run: list[str] = Field(default_factory=list)
    duration_ms: NonNegativeInt | None = None
    raw: dict[str, JsonValue] = Field(default_factory=dict)
    metadata: dict[str, JsonValue] = Field(default_factory=dict)
127
+
128
+
129
class NormalizedRun(ContractModel):
    # Harness-independent shape for a single run result.

    # Required, non-empty identifiers.
    task_id: str = Field(min_length=1)
    harness: str = Field(min_length=1)
    model: str = Field(min_length=1)

    # Unlike EvalRun, the defaults here are "benchmark" / "clean_room".
    mode: EvalRunMode = "benchmark"
    context_profile: ContextProfile = "clean_room"
    final_status: FinalStatus

    checks: list[str] = Field(default_factory=list)
    duration_ms: NonNegativeInt | None = None  # >= 0 when set
    score: Score | None = None  # within 0.0-1.0 when set
    metadata: dict[str, JsonValue] = Field(default_factory=dict)
140
+
141
+
142
class FixtureBundleManifest(ContractModel):
    # Manifest describing a written fixture bundle.

    # Fixed package name; any other supplied value is rejected below.
    package: str = "agent-eval-contract"
    version: str = Field(min_length=1)
    contract_version: str = Field(min_length=1)
    # Filled with the current UTC time when not supplied.
    generated_at: datetime = Field(default_factory=utc_now)

    # Listings of what the bundle contains.
    samples: list[str] = Field(default_factory=list)
    templates: list[str] = Field(default_factory=list)
    schemas: list[str] = Field(default_factory=list)
    metadata: dict[str, JsonValue] = Field(default_factory=dict)

    @field_validator("package")
    @classmethod
    def package_name_matches(_cls, value: str) -> str:
        # Accept only the exact package name; return it unchanged.
        if value == "agent-eval-contract":
            return value
        raise ValueError("package must be agent-eval-contract")
@@ -0,0 +1 @@
1
+