agent-eval-contract 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. agent_eval_contract-0.2.0/CHANGELOG.md +11 -0
  2. agent_eval_contract-0.2.0/LICENSE +21 -0
  3. agent_eval_contract-0.2.0/MANIFEST.in +7 -0
  4. agent_eval_contract-0.2.0/PKG-INFO +107 -0
  5. agent_eval_contract-0.2.0/README.md +82 -0
  6. agent_eval_contract-0.2.0/RELEASE.md +53 -0
  7. agent_eval_contract-0.2.0/agent_eval_contract/__init__.py +104 -0
  8. agent_eval_contract-0.2.0/agent_eval_contract/clean_room.py +27 -0
  9. agent_eval_contract-0.2.0/agent_eval_contract/cli.py +114 -0
  10. agent_eval_contract-0.2.0/agent_eval_contract/external.py +215 -0
  11. agent_eval_contract-0.2.0/agent_eval_contract/fixture_runner.py +82 -0
  12. agent_eval_contract-0.2.0/agent_eval_contract/models.py +157 -0
  13. agent_eval_contract-0.2.0/agent_eval_contract/py.typed +1 -0
  14. agent_eval_contract-0.2.0/agent_eval_contract/release.py +52 -0
  15. agent_eval_contract-0.2.0/agent_eval_contract/release_metadata.json +39 -0
  16. agent_eval_contract-0.2.0/agent_eval_contract/samples/eval_failure.json +22 -0
  17. agent_eval_contract-0.2.0/agent_eval_contract/samples/eval_run.json +25 -0
  18. agent_eval_contract-0.2.0/agent_eval_contract/samples/eval_score.json +16 -0
  19. agent_eval_contract-0.2.0/agent_eval_contract/samples/eval_task.json +23 -0
  20. agent_eval_contract-0.2.0/agent_eval_contract/samples/external_result_normalization.json +39 -0
  21. agent_eval_contract-0.2.0/agent_eval_contract/samples.py +81 -0
  22. agent_eval_contract-0.2.0/agent_eval_contract/schema_export.py +42 -0
  23. agent_eval_contract-0.2.0/agent_eval_contract/schemas.py +47 -0
  24. agent_eval_contract-0.2.0/agent_eval_contract/templates.py +101 -0
  25. agent_eval_contract-0.2.0/agent_eval_contract/validators.py +116 -0
  26. agent_eval_contract-0.2.0/agent_eval_contract.egg-info/PKG-INFO +107 -0
  27. agent_eval_contract-0.2.0/agent_eval_contract.egg-info/SOURCES.txt +38 -0
  28. agent_eval_contract-0.2.0/agent_eval_contract.egg-info/dependency_links.txt +1 -0
  29. agent_eval_contract-0.2.0/agent_eval_contract.egg-info/entry_points.txt +3 -0
  30. agent_eval_contract-0.2.0/agent_eval_contract.egg-info/requires.txt +1 -0
  31. agent_eval_contract-0.2.0/agent_eval_contract.egg-info/top_level.txt +1 -0
  32. agent_eval_contract-0.2.0/docs/adapters.md +56 -0
  33. agent_eval_contract-0.2.0/docs/contract.md +45 -0
  34. agent_eval_contract-0.2.0/docs/field-reference.md +73 -0
  35. agent_eval_contract-0.2.0/examples/eval_run.json +15 -0
  36. agent_eval_contract-0.2.0/examples/swe_bench_result.json +13 -0
  37. agent_eval_contract-0.2.0/examples/terminal_bench_result.json +8 -0
  38. agent_eval_contract-0.2.0/pyproject.toml +73 -0
  39. agent_eval_contract-0.2.0/setup.cfg +4 -0
  40. agent_eval_contract-0.2.0/tests/test_agent_eval_contract.py +364 -0
@@ -0,0 +1,11 @@
1
+ # Changelog
2
+
3
+ ## 0.2.0 - 2026-07-04
4
+
5
+ - Reworked the package into a public Pydantic contract library for agent evaluation records.
6
+ - Added typed models, runtime validators, JSON Schema export, CLI subcommands, public examples, and fixture bundles.
7
+ - Removed private workflow vocabulary from the public core.
8
+
9
+ ## 0.1.0 - 2026-06-27
10
+
11
+ - Initial internal package extraction.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jakye Amos
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,7 @@
1
+ include LICENSE
2
+ include README.md
3
+ include RELEASE.md
4
+ include CHANGELOG.md
5
+ recursive-include docs *.md
6
+ recursive-include examples *.json
7
+ recursive-include tests *.py
@@ -0,0 +1,107 @@
1
+ Metadata-Version: 2.4
2
+ Name: agent-eval-contract
3
+ Version: 0.2.0
4
+ Summary: Pydantic contracts and JSON Schemas for portable agent evaluation records.
5
+ Author: Jakye Amos
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/jakyeamos/agent-eval-contract
8
+ Project-URL: Repository, https://github.com/jakyeamos/agent-eval-contract
9
+ Project-URL: Issues, https://github.com/jakyeamos/agent-eval-contract/issues
10
+ Keywords: agents,ai-evaluation,evals,harness,json-schema,pydantic
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Programming Language :: Python :: 3.14
17
+ Classifier: Topic :: Software Development :: Quality Assurance
18
+ Classifier: Topic :: Software Development :: Testing
19
+ Classifier: Typing :: Typed
20
+ Requires-Python: >=3.12
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pydantic<3,>=2
24
+ Dynamic: license-file
25
+
26
+ # Agent Eval Contract
27
+
28
+ Pydantic contracts and JSON Schemas for portable agent evaluation records.
29
+
30
+ Use this package when you are experimenting with agents, harnesses, CI checks, or benchmark runners and need a stable record shape for tasks, runs, scores, failures, and normalized external results. It does not run evaluations, call model providers, store dashboards, or orchestrate agents. It gives those tools a shared contract.
31
+
32
+ ## Install
33
+
34
+ ```bash
35
+ pip install agent-eval-contract
36
+ ```
37
+
38
+ For local development from this repo:
39
+
40
+ ```bash
41
+ uv sync --dev
42
+ ```
43
+
44
+ ## Validate A Record
45
+
46
+ ```python
47
+ from agent_eval_contract import validate_eval_run
48
+
49
+ run = validate_eval_run(
50
+ {
51
+ "run_id": "run-login-flow-001",
52
+ "task_id": "task-login-flow-001",
53
+ "harness": "pytest",
54
+ "model": "gpt-5",
55
+ "mode": "autonomous",
56
+ "context_profile": "repo_only",
57
+ "final_status": "success",
58
+ "checks": ["pytest tests/test_auth_redirect.py -q"],
59
+ }
60
+ )
61
+
62
+ print(run.model_dump(mode="json"))
63
+ ```
64
+
65
+ Validation returns typed Pydantic model instances. Invalid records raise `pydantic.ValidationError` with structured field errors.
66
+
67
+ ## CLI
68
+
69
+ ```bash
70
+ agent-eval-contract validate --kind run --file examples/eval_run.json
71
+ agent-eval-contract schemas --output-dir /tmp/agent-eval-contract-schemas
72
+ agent-eval-contract fixtures --output-dir /tmp/agent-eval-contract-fixtures
73
+ agent-eval-contract normalize --harness terminal-bench --file examples/terminal_bench_result.json --task-id task-login-flow-001 --model gpt-5
74
+ agent-eval-contract normalize --harness swe-bench --file examples/swe_bench_result.json
75
+ ```
76
+
77
+ The legacy `agent-eval-contract-fixtures` command still writes fixture bundles for one release.
78
+
79
+ ## What It Provides
80
+
81
+ - Pydantic models for eval tasks, runs, scores, failures, external results, normalized runs, and fixture manifests
82
+ - runtime validators that return typed model instances
83
+ - JSON Schema export for all public models
84
+ - bundled sample records and markdown templates
85
+ - Terminal-Bench and SWE-bench oriented normalization helpers
86
+ - a small CLI for validation, schema export, fixture generation, and normalization
87
+
88
+ ## Contract Vocabulary
89
+
90
+ The public core uses generic vocabulary only. Project-specific concepts should live in `metadata` or a separate adapter package.
91
+
92
+ - `context_profile`: `repo_only`, `provided_context`, `clean_room`, `tool_augmented`, `full_workspace`
93
+ - `source`: `manual`, `ci`, `benchmark`, `production_trace`, `synthetic`
94
+ - `mode`: `interactive`, `autonomous`, `shadow`, `replay`, `benchmark`
95
+ - `final_status`: `success`, `partial`, `failed`, `abandoned`, `error`
96
+
97
+ See [docs/contract.md](docs/contract.md), [docs/field-reference.md](docs/field-reference.md), and [docs/adapters.md](docs/adapters.md) for the model contract and adapter guidance.
98
+
99
+ ## Development
100
+
101
+ ```bash
102
+ uv run ruff check agent_eval_contract tests
103
+ uv run ruff format --check agent_eval_contract tests
104
+ uv run basedpyright agent_eval_contract tests
105
+ uv run pytest -q
106
+ uv build --out-dir /tmp/agent-eval-contract-dist
107
+ ```
@@ -0,0 +1,82 @@
1
+ # Agent Eval Contract
2
+
3
+ Pydantic contracts and JSON Schemas for portable agent evaluation records.
4
+
5
+ Use this package when you are experimenting with agents, harnesses, CI checks, or benchmark runners and need a stable record shape for tasks, runs, scores, failures, and normalized external results. It does not run evaluations, call model providers, store dashboards, or orchestrate agents. It gives those tools a shared contract.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install agent-eval-contract
11
+ ```
12
+
13
+ For local development from this repo:
14
+
15
+ ```bash
16
+ uv sync --dev
17
+ ```
18
+
19
+ ## Validate A Record
20
+
21
+ ```python
22
+ from agent_eval_contract import validate_eval_run
23
+
24
+ run = validate_eval_run(
25
+ {
26
+ "run_id": "run-login-flow-001",
27
+ "task_id": "task-login-flow-001",
28
+ "harness": "pytest",
29
+ "model": "gpt-5",
30
+ "mode": "autonomous",
31
+ "context_profile": "repo_only",
32
+ "final_status": "success",
33
+ "checks": ["pytest tests/test_auth_redirect.py -q"],
34
+ }
35
+ )
36
+
37
+ print(run.model_dump(mode="json"))
38
+ ```
39
+
40
+ Validation returns typed Pydantic model instances. Invalid records raise `pydantic.ValidationError` with structured field errors.
41
+
42
+ ## CLI
43
+
44
+ ```bash
45
+ agent-eval-contract validate --kind run --file examples/eval_run.json
46
+ agent-eval-contract schemas --output-dir /tmp/agent-eval-contract-schemas
47
+ agent-eval-contract fixtures --output-dir /tmp/agent-eval-contract-fixtures
48
+ agent-eval-contract normalize --harness terminal-bench --file examples/terminal_bench_result.json --task-id task-login-flow-001 --model gpt-5
49
+ agent-eval-contract normalize --harness swe-bench --file examples/swe_bench_result.json
50
+ ```
51
+
52
+ The legacy `agent-eval-contract-fixtures` command still writes fixture bundles for one release.
53
+
54
+ ## What It Provides
55
+
56
+ - Pydantic models for eval tasks, runs, scores, failures, external results, normalized runs, and fixture manifests
57
+ - runtime validators that return typed model instances
58
+ - JSON Schema export for all public models
59
+ - bundled sample records and markdown templates
60
+ - Terminal-Bench and SWE-bench oriented normalization helpers
61
+ - a small CLI for validation, schema export, fixture generation, and normalization
62
+
63
+ ## Contract Vocabulary
64
+
65
+ The public core uses generic vocabulary only. Project-specific concepts should live in `metadata` or a separate adapter package.
66
+
67
+ - `context_profile`: `repo_only`, `provided_context`, `clean_room`, `tool_augmented`, `full_workspace`
68
+ - `source`: `manual`, `ci`, `benchmark`, `production_trace`, `synthetic`
69
+ - `mode`: `interactive`, `autonomous`, `shadow`, `replay`, `benchmark`
70
+ - `final_status`: `success`, `partial`, `failed`, `abandoned`, `error`
71
+
72
+ See [docs/contract.md](docs/contract.md), [docs/field-reference.md](docs/field-reference.md), and [docs/adapters.md](docs/adapters.md) for the model contract and adapter guidance.
73
+
74
+ ## Development
75
+
76
+ ```bash
77
+ uv run ruff check agent_eval_contract tests
78
+ uv run ruff format --check agent_eval_contract tests
79
+ uv run basedpyright agent_eval_contract tests
80
+ uv run pytest -q
81
+ uv build --out-dir /tmp/agent-eval-contract-dist
82
+ ```
@@ -0,0 +1,53 @@
1
+ # Release
2
+
3
+ ## Current Version
4
+
5
+ - Version: `0.2.0`
6
+ - Contract version: `0.1`
7
+ - Status: public package release
8
+ - Source package: `agent_eval_contract`
9
+
10
+ ## Public Promise
11
+
12
+ `agent-eval-contract` defines, validates, serializes, exports JSON Schema for, and normalizes portable agent evaluation records.
13
+
14
+ ## Release Checks
15
+
16
+ CI runs the same quality ladder on pull requests and pushes. Run this local block before tagging or publishing:
17
+
18
+ ```bash
19
+ uv run ruff check agent_eval_contract tests
20
+ uv run ruff format --check agent_eval_contract tests
21
+ uv run basedpyright agent_eval_contract tests
22
+ uv run pytest -q
23
+ uv build --out-dir /tmp/agent-eval-contract-dist
24
+ ```
25
+
26
+ Then install the wheel in a temp virtualenv and smoke test:
27
+
28
+ ```bash
29
+ python -m venv /tmp/agent-eval-contract-venv
30
+ /tmp/agent-eval-contract-venv/bin/pip install /tmp/agent-eval-contract-dist/agent_eval_contract-0.2.0-py3-none-any.whl
31
+ /tmp/agent-eval-contract-venv/bin/agent-eval-contract validate --kind run --file examples/eval_run.json
32
+ /tmp/agent-eval-contract-venv/bin/agent-eval-contract schemas --output-dir /tmp/agent-eval-contract-schemas
33
+ /tmp/agent-eval-contract-venv/bin/agent-eval-contract normalize --harness swe-bench --file examples/swe_bench_result.json
34
+ ```
35
+
36
+ ## Boundaries
37
+
38
+ Public core:
39
+
40
+ - Pydantic record models
41
+ - runtime validation helpers
42
+ - JSON Schema export
43
+ - external harness normalization
44
+ - fixture bundle generation
45
+ - CLI validation and schema export
46
+
47
+ Out of scope:
48
+
49
+ - evaluation execution
50
+ - model provider calls
51
+ - dashboard storage
52
+ - private workflow vocabulary
53
+ - agent orchestration
@@ -0,0 +1,104 @@
1
+ from __future__ import annotations
2
+
3
+ from .clean_room import run_clean_room_contract_check
4
+ from .external import (
5
+ normalize_external_result,
6
+ normalize_swe_bench_result,
7
+ normalize_terminal_bench_result,
8
+ to_swe_bench_format,
9
+ to_terminal_bench_format,
10
+ )
11
+ from .models import (
12
+ CONTEXT_PROFILES,
13
+ EVAL_RUN_MODES,
14
+ EVAL_TASK_SOURCES,
15
+ EXTERNAL_HARNESSES,
16
+ FAILURE_PRIORITIES,
17
+ FINAL_STATUSES,
18
+ ContextProfile,
19
+ EvalFailure,
20
+ EvalRun,
21
+ EvalRunMode,
22
+ EvalScore,
23
+ EvalTask,
24
+ EvalTaskSource,
25
+ ExternalHarness,
26
+ ExternalResult,
27
+ FailurePriority,
28
+ FinalStatus,
29
+ FixtureBundleManifest,
30
+ JsonValue,
31
+ NormalizedRun,
32
+ )
33
+ from .release import load_release_metadata, validate_release_metadata
34
+ from .samples import load_sample, validate_all_samples, validate_sample
35
+ from .schema_export import export_json_schemas
36
+ from .templates import (
37
+ render_eval_template,
38
+ supported_template_ids,
39
+ validate_eval_template,
40
+ validate_eval_template_file,
41
+ validate_template_directory,
42
+ )
43
+ from .validators import (
44
+ HARNESS_DIMENSION_NAMES,
45
+ validate_context_profile,
46
+ validate_eval_failure,
47
+ validate_eval_run,
48
+ validate_eval_score,
49
+ validate_eval_task,
50
+ validate_external_result,
51
+ validate_final_status,
52
+ validate_harness_fixture_components,
53
+ validate_priority,
54
+ )
55
+
56
+ __all__ = [
57
+ "CONTEXT_PROFILES",
58
+ "EVAL_RUN_MODES",
59
+ "EVAL_TASK_SOURCES",
60
+ "EXTERNAL_HARNESSES",
61
+ "FAILURE_PRIORITIES",
62
+ "FINAL_STATUSES",
63
+ "HARNESS_DIMENSION_NAMES",
64
+ "ContextProfile",
65
+ "EvalFailure",
66
+ "EvalRun",
67
+ "EvalRunMode",
68
+ "EvalScore",
69
+ "EvalTask",
70
+ "EvalTaskSource",
71
+ "ExternalHarness",
72
+ "ExternalResult",
73
+ "FailurePriority",
74
+ "FinalStatus",
75
+ "FixtureBundleManifest",
76
+ "JsonValue",
77
+ "NormalizedRun",
78
+ "export_json_schemas",
79
+ "load_release_metadata",
80
+ "load_sample",
81
+ "normalize_external_result",
82
+ "normalize_swe_bench_result",
83
+ "normalize_terminal_bench_result",
84
+ "render_eval_template",
85
+ "run_clean_room_contract_check",
86
+ "supported_template_ids",
87
+ "to_swe_bench_format",
88
+ "to_terminal_bench_format",
89
+ "validate_all_samples",
90
+ "validate_context_profile",
91
+ "validate_eval_failure",
92
+ "validate_eval_run",
93
+ "validate_eval_score",
94
+ "validate_eval_task",
95
+ "validate_eval_template",
96
+ "validate_eval_template_file",
97
+ "validate_external_result",
98
+ "validate_final_status",
99
+ "validate_harness_fixture_components",
100
+ "validate_priority",
101
+ "validate_release_metadata",
102
+ "validate_sample",
103
+ "validate_template_directory",
104
+ ]
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from .samples import validate_all_samples
7
+ from .templates import validate_template_directory
8
+
9
+
10
def run_clean_room_contract_check(
    *,
    template_root: Path,
    sample_root: Path | None = None,
) -> dict[str, Any]:
    """Validate templates and samples and return a summary report.

    Args:
        template_root: Directory handed to ``validate_template_directory``.
        sample_root: Optional directory forwarded to ``validate_all_samples``
            as ``sample_root``. When ``None``, ``validate_all_samples`` is
            called with no arguments and uses its own default.

    Returns:
        A dict with ``ok`` (always ``True``), ``template_count``,
        ``sample_count``, and the raw ``templates`` / ``samples`` results
        returned by the two validators.
    """
    # NOTE(review): "ok" is unconditionally True here; failures presumably
    # surface as exceptions raised by the validators — confirm in their modules.
    checked_templates = validate_template_directory(template_root)

    if sample_root is None:
        checked_samples = validate_all_samples()
    else:
        checked_samples = validate_all_samples(sample_root=sample_root)

    report: dict[str, Any] = {
        "ok": True,
        "template_count": len(checked_templates),
        "sample_count": len(checked_samples),
        "templates": checked_templates,
        "samples": checked_samples,
    }
    return report
@@ -0,0 +1,114 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel
9
+
10
+ from .external import normalize_external_result
11
+ from .fixture_runner import write_contract_fixture_bundle
12
+ from .schema_export import export_json_schemas
13
+ from .validators import (
14
+ validate_eval_failure,
15
+ validate_eval_run,
16
+ validate_eval_score,
17
+ validate_eval_task,
18
+ )
19
+
20
+
21
def _load_json(path: Path) -> dict[str, Any]:
    """Read ``path`` as UTF-8 text and return its top-level JSON object.

    The path is user-expanded and resolved before it is read.

    Raises:
        ValueError: If the decoded document is not a JSON object. The message
            quotes ``path`` as it was passed in, not the resolved path.
    """
    source = path.expanduser().resolve()
    document = json.loads(source.read_text(encoding="utf-8"))
    if isinstance(document, dict):
        return document
    raise ValueError(f"{path} must contain a JSON object")
26
+
27
+
28
def _print_json(value: BaseModel | dict[str, Any] | list[str]) -> None:
    """Print ``value`` to stdout as 2-space-indented JSON with sorted keys.

    Pydantic models are converted with ``model_dump(mode="json")`` first;
    any other value is serialized as given.
    """
    if isinstance(value, BaseModel):
        payload = value.model_dump(mode="json")
    else:
        payload = value
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    print(rendered)
31
+
32
+
33
def _run_fixtures(args: argparse.Namespace) -> int:
    """Handle the ``fixtures`` subcommand.

    Writes the contract fixture bundle into ``args.output_dir``, prints the
    value returned by ``write_contract_fixture_bundle`` as JSON, and returns
    exit status 0.
    """
    target_dir = Path(args.output_dir)
    bundle = write_contract_fixture_bundle(target_dir)
    _print_json(bundle)
    return 0
36
+
37
+
38
def _run_schemas(args: argparse.Namespace) -> int:
    """Handle the ``schemas`` subcommand.

    Exports JSON Schemas into ``args.output_dir``, prints the export result
    under a ``"schemas"`` key as JSON, and returns exit status 0.
    """
    target_dir = Path(args.output_dir)
    exported = export_json_schemas(target_dir)
    _print_json({"schemas": exported})
    return 0
41
+
42
+
43
def _run_validate(args: argparse.Namespace) -> int:
    """Handle the ``validate`` subcommand.

    Loads the JSON object in ``args.file``, runs the validator selected by
    ``args.kind`` (``task``, ``run``, ``score`` or ``failure``), prints the
    validator's result as JSON, and returns exit status 0. Exceptions raised
    by loading or validation are not caught here.
    """
    record = _load_json(Path(args.file))
    # Dispatch table: CLI kind -> validator function.
    validator = {
        "task": validate_eval_task,
        "run": validate_eval_run,
        "score": validate_eval_score,
        "failure": validate_eval_failure,
    }[args.kind]
    validated = validator(record)
    _print_json(validated)
    return 0
53
+
54
+
55
def _run_normalize(args: argparse.Namespace) -> int:
    """Handle the ``normalize`` subcommand.

    Loads the JSON object in ``args.file`` and passes it to
    ``normalize_external_result`` together with ``args.task_id`` (as
    ``eval_task_id``), ``args.harness`` and ``args.model``. Prints the
    result as JSON and returns exit status 0.
    """
    external_payload = _load_json(Path(args.file))
    result = normalize_external_result(
        external_payload,
        eval_task_id=args.task_id,
        harness=args.harness,
        model=args.model,
    )
    _print_json(result)
    return 0
65
+
66
+
67
def build_parser() -> argparse.ArgumentParser:
    """Build the ``agent-eval-contract`` command-line parser.

    A subcommand is required and is stored on the namespace as ``command``.
    Four subcommands are registered, in this order: ``fixtures``,
    ``schemas``, ``validate`` and ``normalize``. Each one stores its handler
    function on the parsed namespace as ``func``.
    """
    root = argparse.ArgumentParser(
        prog="agent-eval-contract",
        description="Validate, normalize, and export portable agent evaluation contracts.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # fixtures: write the bundled sample/template/schema artifacts.
    fixtures_cmd = commands.add_parser(
        "fixtures",
        help="Write sample records, templates, and schemas.",
    )
    fixtures_cmd.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write fixture artifacts into.",
    )
    fixtures_cmd.set_defaults(func=_run_fixtures)

    # schemas: export JSON Schemas only.
    schemas_cmd = commands.add_parser(
        "schemas",
        help="Export JSON Schemas for public contract models.",
    )
    schemas_cmd.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write schema files into.",
    )
    schemas_cmd.set_defaults(func=_run_schemas)

    # validate: check one JSON record against a chosen model kind.
    validate_cmd = commands.add_parser(
        "validate",
        help="Validate a JSON record against a contract model.",
    )
    validate_cmd.add_argument(
        "--kind",
        choices=("task", "run", "score", "failure"),
        required=True,
    )
    validate_cmd.add_argument(
        "--file",
        required=True,
        help="JSON file to validate.",
    )
    validate_cmd.set_defaults(func=_run_validate)

    # normalize: convert external harness output; task id and model are optional.
    normalize_cmd = commands.add_parser(
        "normalize",
        help="Normalize external harness output.",
    )
    normalize_cmd.add_argument(
        "--harness",
        choices=("terminal-bench", "swe-bench"),
        required=True,
    )
    normalize_cmd.add_argument(
        "--file",
        required=True,
        help="External result JSON file to normalize.",
    )
    normalize_cmd.add_argument(
        "--task-id",
        help="Task id to use when the external result omits one.",
    )
    normalize_cmd.add_argument(
        "--model",
        help="Model name to use when the external result omits one.",
    )
    normalize_cmd.set_defaults(func=_run_normalize)

    return root
105
+
106
+
107
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and run the selected subcommand handler.

    Args:
        argv: Argument list to parse. ``None`` is passed straight to
            ``ArgumentParser.parse_args``, which then reads ``sys.argv[1:]``.

    Returns:
        The handler's return value coerced to ``int``, for use as the
        process exit status.
    """
    namespace = build_parser().parse_args(argv)
    # Each subparser stored its handler on the namespace as ``func``.
    handler = namespace.func
    return int(handler(namespace))


# Script entry point: exit with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())