agent-eval-contract 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_eval_contract/__init__.py +104 -0
- agent_eval_contract/clean_room.py +27 -0
- agent_eval_contract/cli.py +114 -0
- agent_eval_contract/external.py +215 -0
- agent_eval_contract/fixture_runner.py +82 -0
- agent_eval_contract/models.py +157 -0
- agent_eval_contract/py.typed +1 -0
- agent_eval_contract/release.py +52 -0
- agent_eval_contract/release_metadata.json +39 -0
- agent_eval_contract/samples/eval_failure.json +22 -0
- agent_eval_contract/samples/eval_run.json +25 -0
- agent_eval_contract/samples/eval_score.json +16 -0
- agent_eval_contract/samples/eval_task.json +23 -0
- agent_eval_contract/samples/external_result_normalization.json +39 -0
- agent_eval_contract/samples.py +81 -0
- agent_eval_contract/schema_export.py +42 -0
- agent_eval_contract/schemas.py +47 -0
- agent_eval_contract/templates.py +101 -0
- agent_eval_contract/validators.py +116 -0
- agent_eval_contract-0.2.0.dist-info/METADATA +107 -0
- agent_eval_contract-0.2.0.dist-info/RECORD +25 -0
- agent_eval_contract-0.2.0.dist-info/WHEEL +5 -0
- agent_eval_contract-0.2.0.dist-info/entry_points.txt +3 -0
- agent_eval_contract-0.2.0.dist-info/licenses/LICENSE +21 -0
- agent_eval_contract-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
# Release metadata document shipped in the same directory as this module.
RELEASE_METADATA_PATH = Path(__file__).resolve().with_name("release_metadata.json")

# Top-level keys that validate_release_metadata requires to be present.
REQUIRED_RELEASE_METADATA_KEYS = (
    # identity and versioning
    "package_name",
    "version",
    "contract_version",
    "status",
    "python_requires",
    # public scope description
    "public_promise",
    "public_surfaces",
    "out_of_scope",
    "public_modules",
    # release gating
    "release_blockers",
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def load_release_metadata(path: Path = RELEASE_METADATA_PATH) -> dict[str, Any]:
    """Read, parse, and validate the release metadata JSON document at *path*.

    Args:
        path: File to read as UTF-8 JSON; defaults to RELEASE_METADATA_PATH.

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: If the parsed document is not a JSON object, or if
            validate_release_metadata rejects it.
    """
    raw_text = path.read_text(encoding="utf-8")
    document = json.loads(raw_text)
    if isinstance(document, dict):
        validate_release_metadata(document)
        return document
    raise ValueError("Agent eval release metadata must be a JSON object.")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def validate_release_metadata(metadata: dict[str, Any]) -> None:
    """Check that *metadata* has the shape required of release metadata.

    The checks run in this order and stop at the first failure:

    1. every key in REQUIRED_RELEASE_METADATA_KEYS is present;
    2. ``package_name`` equals ``"agent-eval-contract"``;
    3. ``status`` is ``"public_package_candidate"`` or ``"published"``;
    4. ``public_promise`` is a non-blank string;
    5. ``public_surfaces``, ``out_of_scope`` and ``public_modules`` are
       non-empty lists of strings;
    6. ``release_blockers`` is a list of strings (an empty list is accepted).

    Args:
        metadata: The parsed release metadata object.

    Raises:
        ValueError: On the first check that fails.
    """
    missing = [key for key in REQUIRED_RELEASE_METADATA_KEYS if key not in metadata]
    if missing:
        raise ValueError(f"Agent eval release metadata is missing keys: {', '.join(missing)}")
    if metadata["package_name"] != "agent-eval-contract":
        raise ValueError("Agent eval release metadata package_name must be agent-eval-contract.")
    status = metadata["status"]
    # Check the type before the membership test: an unhashable value (a JSON
    # array or object) would otherwise raise TypeError from the set lookup
    # instead of the ValueError used for every other invalid field.
    if not isinstance(status, str) or status not in {"public_package_candidate", "published"}:
        raise ValueError("Agent eval release metadata status is invalid.")
    public_promise = metadata["public_promise"]
    if not isinstance(public_promise, str) or not public_promise.strip():
        raise ValueError("Agent eval release metadata public_promise must be a non-empty string.")
    for key in ("public_surfaces", "out_of_scope", "public_modules"):
        value = metadata[key]
        if (
            not isinstance(value, list)
            or not value
            or not all(isinstance(item, str) for item in value)
        ):
            raise ValueError(f"Agent eval release metadata field '{key}' must be a string list.")
    blockers = metadata["release_blockers"]
    # Unlike the lists above, an empty blocker list is valid.
    if not isinstance(blockers, list) or not all(isinstance(item, str) for item in blockers):
        raise ValueError(
            "Agent eval release metadata field 'release_blockers' must be a string list."
        )
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"package_name": "agent-eval-contract",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"contract_version": "0.1",
|
|
5
|
+
"status": "published",
|
|
6
|
+
"python_requires": ">=3.12",
|
|
7
|
+
"public_promise": "Define, validate, serialize, export JSON Schema for, and normalize portable agent evaluation records.",
|
|
8
|
+
"public_surfaces": [
|
|
9
|
+
"Pydantic evaluation record models",
|
|
10
|
+
"runtime validation helpers",
|
|
11
|
+
"JSON Schema export",
|
|
12
|
+
"external harness result normalization",
|
|
13
|
+
"sample records",
|
|
14
|
+
"template rendering and validation",
|
|
15
|
+
"fixture bundle generation",
|
|
16
|
+
"agent-eval-contract CLI"
|
|
17
|
+
],
|
|
18
|
+
"out_of_scope": [
|
|
19
|
+
"evaluation execution",
|
|
20
|
+
"model provider calls",
|
|
21
|
+
"leaderboard hosting",
|
|
22
|
+
"dashboard storage",
|
|
23
|
+
"private workflow vocabulary",
|
|
24
|
+
"agent orchestration"
|
|
25
|
+
],
|
|
26
|
+
"public_modules": [
|
|
27
|
+
"agent_eval_contract.models",
|
|
28
|
+
"agent_eval_contract.validators",
|
|
29
|
+
"agent_eval_contract.external",
|
|
30
|
+
"agent_eval_contract.schema_export",
|
|
31
|
+
"agent_eval_contract.templates",
|
|
32
|
+
"agent_eval_contract.samples",
|
|
33
|
+
"agent_eval_contract.clean_room",
|
|
34
|
+
"agent_eval_contract.fixture_runner",
|
|
35
|
+
"agent_eval_contract.cli",
|
|
36
|
+
"agent_eval_contract.release"
|
|
37
|
+
],
|
|
38
|
+
"release_blockers": []
|
|
39
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"failure_id": "failure-login-flow-001",
|
|
3
|
+
"run_id": "run-login-flow-001",
|
|
4
|
+
"failure_types": [
|
|
5
|
+
"verification_gap"
|
|
6
|
+
],
|
|
7
|
+
"summary": "The first run checked the happy path but skipped the failed-auth redirect case.",
|
|
8
|
+
"suspected_cause": "The harness did not include a regression test for unauthenticated navigation.",
|
|
9
|
+
"affected_components": [
|
|
10
|
+
"app/auth.py",
|
|
11
|
+
"tests/test_auth_redirect.py"
|
|
12
|
+
],
|
|
13
|
+
"recommended_fixes": [
|
|
14
|
+
"Add a regression case that starts from a protected nested route.",
|
|
15
|
+
"Record failed command output in the eval run metadata."
|
|
16
|
+
],
|
|
17
|
+
"priority": "medium",
|
|
18
|
+
"regression_task_id": "task-login-flow-001",
|
|
19
|
+
"metadata": {
|
|
20
|
+
"found_by": "review"
|
|
21
|
+
}
|
|
22
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"run_id": "run-login-flow-001",
|
|
3
|
+
"task_id": "task-login-flow-001",
|
|
4
|
+
"harness": "pytest",
|
|
5
|
+
"model": "gpt-5",
|
|
6
|
+
"mode": "autonomous",
|
|
7
|
+
"context_profile": "repo_only",
|
|
8
|
+
"final_status": "success",
|
|
9
|
+
"started_at": "2026-07-04T00:01:00Z",
|
|
10
|
+
"completed_at": "2026-07-04T00:03:05Z",
|
|
11
|
+
"duration_ms": 125000,
|
|
12
|
+
"total_tokens": 8420,
|
|
13
|
+
"estimated_cost_usd": 0.54,
|
|
14
|
+
"tool_calls": 18,
|
|
15
|
+
"failed_steps": 1,
|
|
16
|
+
"files_changed": 4,
|
|
17
|
+
"checks": [
|
|
18
|
+
"pytest tests/test_auth_redirect.py -q",
|
|
19
|
+
"ruff check app tests"
|
|
20
|
+
],
|
|
21
|
+
"output_summary": "Implemented redirect preservation and added regression coverage.",
|
|
22
|
+
"metadata": {
|
|
23
|
+
"branch": "agent/login-redirect"
|
|
24
|
+
}
|
|
25
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"run_id": "run-login-flow-001",
|
|
3
|
+
"overall_score": 0.91,
|
|
4
|
+
"metrics": {
|
|
5
|
+
"task_success": 1.0,
|
|
6
|
+
"quality_adherence": 0.9,
|
|
7
|
+
"verification_strength": 0.85,
|
|
8
|
+
"cost_efficiency": 0.78,
|
|
9
|
+
"human_trust": 1.0
|
|
10
|
+
},
|
|
11
|
+
"passed": true,
|
|
12
|
+
"reviewer_notes": "The agent fixed the bug and added a focused regression test.",
|
|
13
|
+
"metadata": {
|
|
14
|
+
"reviewer": "human"
|
|
15
|
+
}
|
|
16
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"task_id": "task-login-flow-001",
|
|
3
|
+
"title": "Repair login redirect handling",
|
|
4
|
+
"description": "Fix an agent task where a web app should preserve the requested path after sign-in.",
|
|
5
|
+
"source": "benchmark",
|
|
6
|
+
"context_profile": "repo_only",
|
|
7
|
+
"acceptance_criteria": [
|
|
8
|
+
"Unauthenticated users are redirected back to the original path after sign-in.",
|
|
9
|
+
"Existing authenticated navigation keeps working.",
|
|
10
|
+
"The regression test fails before the fix and passes after it."
|
|
11
|
+
],
|
|
12
|
+
"repo": "example/web-app",
|
|
13
|
+
"start_revision": "abc1234",
|
|
14
|
+
"tags": [
|
|
15
|
+
"web",
|
|
16
|
+
"auth",
|
|
17
|
+
"regression"
|
|
18
|
+
],
|
|
19
|
+
"created_at": "2026-07-04T00:00:00Z",
|
|
20
|
+
"metadata": {
|
|
21
|
+
"difficulty": "medium"
|
|
22
|
+
}
|
|
23
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"external_result": {
|
|
3
|
+
"passed": true,
|
|
4
|
+
"tests_run": [
|
|
5
|
+
"pytest tests/test_auth_redirect.py -q"
|
|
6
|
+
],
|
|
7
|
+
"duration_ms": 125000,
|
|
8
|
+
"score": 0.91
|
|
9
|
+
},
|
|
10
|
+
"request": {
|
|
11
|
+
"eval_task_id": "task-login-flow-001",
|
|
12
|
+
"harness": "terminal-bench",
|
|
13
|
+
"model": "gpt-5"
|
|
14
|
+
},
|
|
15
|
+
"expected_normalized": {
|
|
16
|
+
"task_id": "task-login-flow-001",
|
|
17
|
+
"harness": "terminal-bench",
|
|
18
|
+
"model": "gpt-5",
|
|
19
|
+
"mode": "benchmark",
|
|
20
|
+
"context_profile": "clean_room",
|
|
21
|
+
"final_status": "success",
|
|
22
|
+
"checks": [
|
|
23
|
+
"pytest tests/test_auth_redirect.py -q"
|
|
24
|
+
],
|
|
25
|
+
"duration_ms": 125000,
|
|
26
|
+
"score": 0.91,
|
|
27
|
+
"metadata": {
|
|
28
|
+
"source": "terminal-bench",
|
|
29
|
+
"raw": {
|
|
30
|
+
"passed": true,
|
|
31
|
+
"tests_run": [
|
|
32
|
+
"pytest tests/test_auth_redirect.py -q"
|
|
33
|
+
],
|
|
34
|
+
"duration_ms": 125000,
|
|
35
|
+
"score": 0.91
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from .external import normalize_external_result
|
|
8
|
+
from .validators import (
|
|
9
|
+
validate_eval_failure,
|
|
10
|
+
validate_eval_run,
|
|
11
|
+
validate_eval_score,
|
|
12
|
+
validate_eval_task,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Directory of bundled sample JSON documents, located next to this module.
SAMPLE_ROOT = Path(__file__).resolve().parent.joinpath("samples")

# Sample id -> file name inside the sample directory; every sample is stored
# as "<sample id>.json".
SAMPLE_FILES = {
    sample_id: f"{sample_id}.json"
    for sample_id in (
        "eval_task",
        "eval_run",
        "eval_score",
        "eval_failure",
        "external_result_normalization",
    )
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def load_sample(sample_id: str, *, sample_root: Path = SAMPLE_ROOT) -> dict[str, Any]:
    """Load the bundled sample registered under *sample_id*.

    Args:
        sample_id: A key of SAMPLE_FILES.
        sample_root: Directory containing the sample files; defaults to
            SAMPLE_ROOT.

    Returns:
        The parsed JSON object. No schema validation is performed here.

    Raises:
        ValueError: If *sample_id* is not a key of SAMPLE_FILES, or the file
            does not contain a JSON object.
    """
    if sample_id not in SAMPLE_FILES:
        known_ids = ", ".join(sorted(SAMPLE_FILES))
        raise ValueError(f"Unknown agent eval sample '{sample_id}'. Use one of: {known_ids}.")
    sample_path = sample_root / SAMPLE_FILES[sample_id]
    payload = json.loads(sample_path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"Agent eval sample '{sample_id}' must be a JSON object.")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def validate_sample(sample_id: str, sample: dict[str, Any]) -> None:
|
|
37
|
+
if sample_id == "eval_task":
|
|
38
|
+
validate_eval_task(sample)
|
|
39
|
+
return
|
|
40
|
+
if sample_id == "eval_run":
|
|
41
|
+
validate_eval_run(sample)
|
|
42
|
+
return
|
|
43
|
+
if sample_id == "eval_score":
|
|
44
|
+
validate_eval_score(sample)
|
|
45
|
+
return
|
|
46
|
+
if sample_id == "eval_failure":
|
|
47
|
+
validate_eval_failure(sample)
|
|
48
|
+
return
|
|
49
|
+
if sample_id == "external_result_normalization":
|
|
50
|
+
_validate_external_result_normalization(sample)
|
|
51
|
+
return
|
|
52
|
+
allowed = ", ".join(sorted(SAMPLE_FILES))
|
|
53
|
+
raise ValueError(f"Unknown agent eval sample '{sample_id}'. Use one of: {allowed}.")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def validate_all_samples(*, sample_root: Path = SAMPLE_ROOT) -> list[str]:
    """Load and validate every sample listed in SAMPLE_FILES.

    Samples are processed in sorted id order; the first failure propagates
    and stops the run.

    Args:
        sample_root: Directory containing the sample files; defaults to
            SAMPLE_ROOT.

    Returns:
        The ids of the validated samples, in sorted order.
    """
    ordered_ids = sorted(SAMPLE_FILES)
    for current_id in ordered_ids:
        loaded = load_sample(current_id, sample_root=sample_root)
        validate_sample(current_id, loaded)
    return ordered_ids
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _validate_external_result_normalization(sample: dict[str, Any]) -> None:
    """Check that normalizing the sample's external result gives its expected output.

    The sample must hold three objects: ``external_result``, ``request``
    (with ``eval_task_id``, ``harness`` and ``model``) and
    ``expected_normalized``. The external result is passed to
    normalize_external_result together with the request values coerced to
    ``str``, and the JSON-mode dump of the result is compared for equality
    with ``expected_normalized``.

    Raises:
        ValueError: If any of the three fields is not an object, if
            ``request`` lacks a required key, or if the normalized output
            differs from the expected one.
    """
    external_result = sample.get("external_result")
    request = sample.get("request")
    expected = sample.get("expected_normalized")
    if (
        not isinstance(external_result, dict)
        or not isinstance(request, dict)
        or not isinstance(expected, dict)
    ):
        raise ValueError("Agent eval external normalization sample fields must be objects.")
    # Report absent request keys as ValueError, like every other malformed
    # sample case in this function, instead of letting the lookups below
    # escape as a bare KeyError.
    missing = [key for key in ("eval_task_id", "harness", "model") if key not in request]
    if missing:
        raise ValueError(
            "Agent eval external normalization sample request is missing keys: "
            f"{', '.join(missing)}"
        )
    normalized = normalize_external_result(
        external_result,
        eval_task_id=str(request["eval_task_id"]),
        harness=str(request["harness"]),
        model=str(request["model"]),
    )
    if normalized.model_dump(mode="json") != expected:
        raise ValueError("Agent eval external normalization sample expected output does not match.")
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from .models import (
|
|
9
|
+
EvalFailure,
|
|
10
|
+
EvalRun,
|
|
11
|
+
EvalScore,
|
|
12
|
+
EvalTask,
|
|
13
|
+
ExternalResult,
|
|
14
|
+
FixtureBundleManifest,
|
|
15
|
+
NormalizedRun,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
type ModelType = type[BaseModel]
|
|
19
|
+
|
|
20
|
+
SCHEMA_MODELS: dict[str, ModelType] = {
|
|
21
|
+
"eval_failure": EvalFailure,
|
|
22
|
+
"eval_run": EvalRun,
|
|
23
|
+
"eval_score": EvalScore,
|
|
24
|
+
"eval_task": EvalTask,
|
|
25
|
+
"external_result": ExternalResult,
|
|
26
|
+
"fixture_bundle_manifest": FixtureBundleManifest,
|
|
27
|
+
"normalized_run": NormalizedRun,
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def export_json_schemas(output_dir: Path) -> list[str]:
    """Write one JSON Schema file per entry of SCHEMA_MODELS into *output_dir*.

    The directory is user-expanded, resolved, and created (with parents) if
    needed. Each schema is written as ``<schema id>.schema.json``: UTF-8,
    two-space indent, sorted keys, trailing newline. Schemas are processed in
    sorted id order.

    Args:
        output_dir: Target directory for the schema files.

    Returns:
        The names of the written files, in the order they were written.
    """
    target_dir = output_dir.expanduser().resolve()
    target_dir.mkdir(parents=True, exist_ok=True)
    written: list[str] = []
    for schema_id in sorted(SCHEMA_MODELS):
        schema_file = target_dir / f"{schema_id}.schema.json"
        schema = SCHEMA_MODELS[schema_id].model_json_schema()
        schema_text = json.dumps(schema, indent=2, sort_keys=True)
        schema_file.write_text(schema_text + "\n", encoding="utf-8")
        written.append(schema_file.name)
    return written
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .models import (
|
|
4
|
+
CONTEXT_PROFILES,
|
|
5
|
+
EVAL_RUN_MODES,
|
|
6
|
+
EVAL_TASK_SOURCES,
|
|
7
|
+
EXTERNAL_HARNESSES,
|
|
8
|
+
FAILURE_PRIORITIES,
|
|
9
|
+
FINAL_STATUSES,
|
|
10
|
+
ContextProfile,
|
|
11
|
+
EvalFailure,
|
|
12
|
+
EvalRun,
|
|
13
|
+
EvalRunMode,
|
|
14
|
+
EvalScore,
|
|
15
|
+
EvalTask,
|
|
16
|
+
EvalTaskSource,
|
|
17
|
+
ExternalHarness,
|
|
18
|
+
ExternalResult,
|
|
19
|
+
FailurePriority,
|
|
20
|
+
FinalStatus,
|
|
21
|
+
FixtureBundleManifest,
|
|
22
|
+
JsonValue,
|
|
23
|
+
NormalizedRun,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"CONTEXT_PROFILES",
|
|
28
|
+
"EVAL_RUN_MODES",
|
|
29
|
+
"EVAL_TASK_SOURCES",
|
|
30
|
+
"EXTERNAL_HARNESSES",
|
|
31
|
+
"FAILURE_PRIORITIES",
|
|
32
|
+
"FINAL_STATUSES",
|
|
33
|
+
"ContextProfile",
|
|
34
|
+
"EvalFailure",
|
|
35
|
+
"EvalRun",
|
|
36
|
+
"EvalRunMode",
|
|
37
|
+
"EvalScore",
|
|
38
|
+
"EvalTask",
|
|
39
|
+
"EvalTaskSource",
|
|
40
|
+
"ExternalHarness",
|
|
41
|
+
"ExternalResult",
|
|
42
|
+
"FailurePriority",
|
|
43
|
+
"FinalStatus",
|
|
44
|
+
"FixtureBundleManifest",
|
|
45
|
+
"JsonValue",
|
|
46
|
+
"NormalizedRun",
|
|
47
|
+
]
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
# Template id -> section titles that a document of that template must contain.
# render_eval_template emits the titles in this order as "## <title>" headings;
# validate_eval_template reports any title whose heading is absent.
TEMPLATE_SECTIONS: dict[str, tuple[str, ...]] = {
    "major-task-eval": (
        "Task Summary",
        "Original Acceptance Criteria",
        "Context Profile",
        "Harness Condition",
        "Checks Run",
        "Check Results",
        "Failure Taxonomy Labels",
        "Final Confidence Level",
    ),
    "failure-record": (
        "Identifiers",
        "Condition",
        "Context Profile",
        "Failure Types",
        "Summary",
        "Priority",
        "Standards or Project Truth Update Needed",
    ),
    "shadow-branch-comparison": (
        "Branches",
        "Task Description",
        "Acceptance Criteria",
        "Context Profile",
        "Harness Condition",
        "Test Results",
        "Shadow Branch Delta",
        "Recommendation",
    ),
    "backfill-hotspot": (
        "Hotspot",
        "Evidence",
        "Risk",
        "Suggested Fix",
        "Priority",
        "Blocking Status",
        "Related Failure Taxonomy Label",
    ),
    "portable-context-packet": (
        "Packet ID",
        "Scope",
        "Task Summary",
        "Included Repo Files/Docs",
        "Excluded Content",
        "Privacy/Secret Review",
        "Expiration/Staleness Notes",
    ),
}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def supported_template_ids() -> tuple[str, ...]:
    """Return the keys of TEMPLATE_SECTIONS as a sorted tuple."""
    ordered_ids = list(TEMPLATE_SECTIONS)
    ordered_ids.sort()
    return tuple(ordered_ids)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def render_eval_template(template_id: str) -> str:
    """Render a Markdown skeleton for the template named *template_id*.

    The document starts with a ``#`` title derived from the id (hyphens
    become spaces, then title-cased), followed by one ``##`` heading per
    required section, each with a ``TODO`` placeholder body.

    Raises:
        ValueError: If *template_id* is not a key of TEMPLATE_SECTIONS.
    """
    if template_id not in TEMPLATE_SECTIONS:
        known_ids = ", ".join(supported_template_ids())
        raise ValueError(f"Unknown eval template id '{template_id}'. Use one of: {known_ids}.")
    heading = template_id.replace("-", " ").title()
    placeholders = [f"## {name}\n\nTODO" for name in TEMPLATE_SECTIONS[template_id]]
    body = "\n\n".join(placeholders)
    return f"# {heading}\n\n{body}\n"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def validate_eval_template(template_id: str, markdown: str) -> None:
    """Check that *markdown* contains every section required by *template_id*.

    A section counts as present when the text ``## <title>`` or ``# <title>``
    occurs anywhere in *markdown*. This is a plain substring test, not a
    line-anchored heading match.

    Raises:
        ValueError: If *template_id* is unknown, if *markdown* is blank, or if
            one or more required sections are absent (all absent titles are
            listed in the message).
    """
    if template_id not in TEMPLATE_SECTIONS:
        known_ids = ", ".join(supported_template_ids())
        raise ValueError(f"Unknown eval template id '{template_id}'. Use one of: {known_ids}.")
    if markdown.strip() == "":
        raise ValueError(f"Eval template '{template_id}' must not be empty.")
    absent: list[str] = []
    for title in TEMPLATE_SECTIONS[template_id]:
        markers = (f"## {title}", f"# {title}")
        if not any(marker in markdown for marker in markers):
            absent.append(title)
    if absent:
        raise ValueError(
            f"Eval template '{template_id}' is missing required sections: {', '.join(absent)}"
        )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def validate_eval_template_file(path: Path, *, template_id: str | None = None) -> str:
    """Validate the Markdown file at *path* against an eval template.

    Args:
        path: File to read (user-expanded and resolved before reading, UTF-8).
        template_id: Template to validate against. When omitted or empty, the
            file name without its suffix is used as the template id.

    Returns:
        The template id that was actually used.

    Raises:
        ValueError: Propagated from validate_eval_template.
    """
    resolved_path = path.expanduser().resolve()
    effective_id = template_id if template_id else resolved_path.stem
    contents = resolved_path.read_text(encoding="utf-8")
    validate_eval_template(effective_id, contents)
    return effective_id
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def validate_template_directory(template_root: Path) -> list[str]:
    """Validate ``<template id>.md`` in *template_root* for every supported id.

    Templates are checked in the order given by supported_template_ids; the
    first failure propagates and stops the run.

    Returns:
        The validated template ids, in the order they were checked.
    """
    root = template_root.expanduser().resolve()
    template_ids = list(supported_template_ids())
    for known_id in template_ids:
        validate_eval_template_file(root / f"{known_id}.md", template_id=known_id)
    return template_ids
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Mapping
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from .models import (
|
|
7
|
+
CONTEXT_PROFILES,
|
|
8
|
+
FAILURE_PRIORITIES,
|
|
9
|
+
FINAL_STATUSES,
|
|
10
|
+
EvalFailure,
|
|
11
|
+
EvalRun,
|
|
12
|
+
EvalScore,
|
|
13
|
+
EvalTask,
|
|
14
|
+
ExternalResult,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Names of the harness dimensions. NOTE(review): nothing in the visible part of
# this module reads this tuple; presumably it is consumed by scoring callers.
# Confirm before changing names or order.
HARNESS_DIMENSION_NAMES = (
    "task_success",
    "quality_adherence",
    "runtime_reliability",
    "context_usefulness",
    "verification_strength",
    "cost_efficiency",
    "latency",
    "human_trust",
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _allowed_message(label: str, allowed: frozenset[str]) -> str:
|
|
30
|
+
return f"Invalid {label}. Use one of: {', '.join(sorted(allowed))}."
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def validate_context_profile(context_profile: str) -> None:
    """Raise ValueError unless *context_profile* is a member of CONTEXT_PROFILES."""
    if context_profile in CONTEXT_PROFILES:
        return
    raise ValueError(_allowed_message("context_profile", CONTEXT_PROFILES))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def validate_final_status(final_status: str) -> None:
    """Raise ValueError unless *final_status* is a member of FINAL_STATUSES."""
    if final_status in FINAL_STATUSES:
        return
    raise ValueError(_allowed_message("final_status", FINAL_STATUSES))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def validate_priority(priority: str) -> None:
    """Raise ValueError unless *priority* is a member of FAILURE_PRIORITIES."""
    if priority in FAILURE_PRIORITIES:
        return
    raise ValueError(_allowed_message("priority", FAILURE_PRIORITIES))
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def validate_eval_task(data: Mapping[str, Any]) -> EvalTask:
    """Validate *data* with ``EvalTask.model_validate`` and return the result.

    Errors raised by ``model_validate`` are not caught here.
    """
    task = EvalTask.model_validate(data)
    return task
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def validate_eval_run(data: Mapping[str, Any]) -> EvalRun:
    """Validate *data* with ``EvalRun.model_validate`` and return the result.

    Errors raised by ``model_validate`` are not caught here.
    """
    run = EvalRun.model_validate(data)
    return run
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def validate_eval_score(data: Mapping[str, Any]) -> EvalScore:
    """Validate *data* with ``EvalScore.model_validate`` and return the result.

    Errors raised by ``model_validate`` are not caught here.
    """
    score = EvalScore.model_validate(data)
    return score
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def validate_eval_failure(data: Mapping[str, Any]) -> EvalFailure:
    """Validate *data* with ``EvalFailure.model_validate`` and return the result.

    Errors raised by ``model_validate`` are not caught here.
    """
    failure = EvalFailure.model_validate(data)
    return failure
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def validate_external_result(data: Mapping[str, Any]) -> ExternalResult:
    """Validate *data* with ``ExternalResult.model_validate`` and return the result.

    Errors raised by ``model_validate`` are not caught here.
    """
    result = ExternalResult.model_validate(data)
    return result
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _is_string_list(value: Any) -> bool:
|
|
69
|
+
return isinstance(value, list) and all(isinstance(item, str) for item in value)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _validate_expected_gates(expected_gates: Any) -> None:
|
|
73
|
+
if not isinstance(expected_gates, list):
|
|
74
|
+
raise ValueError("harness fixture expected_gates must be a list")
|
|
75
|
+
for item in expected_gates:
|
|
76
|
+
if not isinstance(item, Mapping):
|
|
77
|
+
raise ValueError("harness fixture expected_gates entries must be objects")
|
|
78
|
+
if not item.get("id") or not item.get("expected_decision"):
|
|
79
|
+
raise ValueError("harness fixture expected_gates entries need id and expected_decision")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _validate_runs(runs: Any) -> None:
|
|
83
|
+
if not isinstance(runs, Mapping) or not runs:
|
|
84
|
+
raise ValueError("harness fixture runs must be a non-empty object")
|
|
85
|
+
for run_id, run in runs.items():
|
|
86
|
+
if not isinstance(run_id, str) or not run_id:
|
|
87
|
+
raise ValueError("harness fixture run ids must be non-empty strings")
|
|
88
|
+
if not isinstance(run, Mapping):
|
|
89
|
+
raise ValueError("harness fixture run artifacts must be objects")
|
|
90
|
+
harness = run.get("harness")
|
|
91
|
+
if not isinstance(harness, Mapping) or not harness.get("name"):
|
|
92
|
+
raise ValueError("harness fixture run artifacts need harness.name")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def validate_harness_fixture_components(
    *,
    task_markdown: str,
    expected_context_packets: list[str],
    expected_gates: list[dict[str, str]],
    expected_success_criteria: list[str],
    golden_outcome_markdown: str,
    scoring: Mapping[str, Any],
    runs: Mapping[str, Mapping[str, Any]],
) -> None:
    """Validate the individual components of a harness fixture.

    Checks run in a fixed order and stop at the first failure: the two
    Markdown texts must be non-blank, the two ``expected_*`` lists must be
    lists of strings, ``scoring`` must be a mapping, and finally
    ``expected_gates`` and ``runs`` are checked by _validate_expected_gates
    and _validate_runs.

    Raises:
        ValueError: Describing the first component that is invalid.
    """

    def _require(condition: object, message: str) -> None:
        # Raise the fixture error when a check did not hold.
        if not condition:
            raise ValueError(message)

    _require(task_markdown.strip(), "harness fixture task_markdown must not be empty")
    _require(
        golden_outcome_markdown.strip(),
        "harness fixture golden_outcome_markdown must not be empty",
    )
    _require(
        _is_string_list(expected_context_packets),
        "harness fixture expected_context_packets must be a string list",
    )
    _require(
        _is_string_list(expected_success_criteria),
        "harness fixture expected_success_criteria must be a string list",
    )
    _require(isinstance(scoring, Mapping), "harness fixture scoring must be an object")
    _validate_expected_gates(expected_gates)
    _validate_runs(runs)
|