graded 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graded-1.0.0/PKG-INFO +133 -0
- graded-1.0.0/README.md +120 -0
- graded-1.0.0/pyproject.toml +23 -0
- graded-1.0.0/setup.cfg +4 -0
- graded-1.0.0/src/graded/__init__.py +4 -0
- graded-1.0.0/src/graded/evaluator.py +329 -0
- graded-1.0.0/src/graded/types.py +145 -0
- graded-1.0.0/src/graded.egg-info/PKG-INFO +133 -0
- graded-1.0.0/src/graded.egg-info/SOURCES.txt +14 -0
- graded-1.0.0/src/graded.egg-info/dependency_links.txt +1 -0
- graded-1.0.0/src/graded.egg-info/requires.txt +4 -0
- graded-1.0.0/src/graded.egg-info/top_level.txt +1 -0
- graded-1.0.0/tests/test_evaluator_artifacts.py +131 -0
- graded-1.0.0/tests/test_evaluator_io.py +58 -0
- graded-1.0.0/tests/test_evaluator_llm.py +64 -0
- graded-1.0.0/tests/test_evaluator_scoring.py +152 -0
graded-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graded
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Defensive verifier framework and helpers for Harbor evaluations
|
|
5
|
+
Classifier: Programming Language :: Python :: 3
|
|
6
|
+
Classifier: Operating System :: OS Independent
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: instructor>=1.0.0
|
|
11
|
+
Requires-Dist: jsonref>=1.1.0
|
|
12
|
+
Requires-Dist: google-genai>=1.47.0
|
|
13
|
+
|
|
14
|
+
# graded 🍳
|
|
15
|
+
|
|
16
|
+
`graded` is a defensive verifier and grading framework designed for agent evaluations (particularly for Harbor agent evaluations). It allows you to declare structured grading criteria, leverage LLM judges with automatic tracing, and safely manage evaluation artifacts.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
Install `graded` directly from PyPI (or your internal registry):
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install graded
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Or with `uv`:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv pip install graded
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Quick Start
|
|
37
|
+
|
|
38
|
+
Create an evaluation script (e.g. `verify.py`) to grade a task workspace:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from graded import Evaluator
|
|
43
|
+
|
|
44
|
+
# Initialize the evaluator
|
|
45
|
+
ev = Evaluator(
|
|
46
|
+
workspace="/workspace",
|
|
47
|
+
output_path="/logs/verifier/reward.json",
|
|
48
|
+
auto_save_artifacts=True
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# 1. Declare a standard criterion
|
|
52
|
+
@ev.criterion(name="has_output_file", weight=1.0)
|
|
53
|
+
def check_output(workspace: Path) -> bool:
|
|
54
|
+
return (workspace / "output.txt").is_file()
|
|
55
|
+
|
|
56
|
+
# 2. Declare a fatal criterion (short-circuits score to 0.0 if failed)
|
|
57
|
+
@ev.criterion(name="no_syntax_errors", weight=2.0, fatal=True)
|
|
58
|
+
def check_syntax(workspace: Path) -> bool:
|
|
59
|
+
# return True or False (or float 0.0 - 1.0)
|
|
60
|
+
return True
|
|
61
|
+
|
|
62
|
+
# 3. Declare a fractional scoring criterion
|
|
63
|
+
@ev.criterion(name="test_pass_rate", weight=3.0)
|
|
64
|
+
def check_tests(workspace: Path) -> float:
|
|
65
|
+
# Returns a score between 0.0 and 1.0
|
|
66
|
+
return 0.8 # e.g., 80% of tests passed
|
|
67
|
+
|
|
68
|
+
# Run the evaluation and write outputs
|
|
69
|
+
if __name__ == "__main__":
|
|
70
|
+
ev.run()
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Core Features
|
|
76
|
+
|
|
77
|
+
### 1. Criteria Declarations (`@ev.criterion`)
|
|
78
|
+
Define check functions using the `@ev.criterion` decorator.
|
|
79
|
+
- **`name`**: The unique identifier for the criterion.
|
|
80
|
+
- **`weight`**: Relative weight of the score in the final weighted average calculation.
|
|
81
|
+
- **`fatal`**: If set to `True`, any score of `0.0` or `False` immediately short-circuits the final score to `0.0`.
|
|
82
|
+
- **Return Value**: Must return a `bool`, `int`, or `float`. Anything else raises a `ValueError`.
|
|
83
|
+
|
|
84
|
+
### 2. LLM Judge with Automatic Tracing
|
|
85
|
+
`graded` integrates with `instructor` to run structured, schema-validated LLM grading prompts, automatically logging prompt, parameters, response schema, and LLM responses to `traces.json`.
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from pydantic import BaseModel, Field
|
|
89
|
+
|
|
90
|
+
class Rubric(BaseModel):
|
|
91
|
+
score: float = Field(description="Score between 0.0 and 1.0 based on correctness.")
|
|
92
|
+
reasoning: str = Field(description="Detailed reasoning for the score.")
|
|
93
|
+
|
|
94
|
+
# In your criterion:
|
|
95
|
+
result = ev.llm_judge(
|
|
96
|
+
model="google/gemini-3.5-flash",
|
|
97
|
+
response_model=Rubric,
|
|
98
|
+
system="You are a strict code correctness evaluator.",
|
|
99
|
+
prompt="Compare the student's solution in code.py with the requirements...",
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
print(f"LLM Score: {result.score}")
|
|
103
|
+
print(f"Reasoning: {result.reasoning}")
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 3. File & Artifact Management
|
|
107
|
+
Safely access files and copy evaluation artifacts to the logs directory for post-evaluation review.
|
|
108
|
+
|
|
109
|
+
- **`ev.read_file(filename)`**: Safely reads content as a string. Auto-saves a copy to artifacts.
|
|
110
|
+
- **`ev.load_json(filename)`**: Safely parses JSON file content. Auto-saves a copy to artifacts.
|
|
111
|
+
- **`ev.save_file(filename, content)`**: Save arbitrary text/data to the artifacts directory.
|
|
112
|
+
- **`ev.save_dir(dirname)`**: Copy an entire directory from the workspace to the artifacts directory.
|
|
113
|
+
- **`ev.load_trajectory(path)`**: Load and parse an agent's ATIF `trajectory.json` file into a typed `Trajectory` object.
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Outputs
|
|
118
|
+
|
|
119
|
+
When `ev.run()` completes, the following files are written to the directory containing your configured `output_path`:
|
|
120
|
+
|
|
121
|
+
1. **`reward.json`**: A flat JSON dictionary containing the final calculated `reward` and the individual scores for each criterion:
|
|
122
|
+
```json
|
|
123
|
+
{
|
|
124
|
+
"reward": 0.9,
|
|
125
|
+
"has_output_file": 1.0,
|
|
126
|
+
"no_syntax_errors": 1.0,
|
|
127
|
+
"test_pass_rate": 0.8
|
|
128
|
+
}
|
|
129
|
+
```
|
|
130
|
+
2. **`reward.txt`**: A text file containing just the final reward float value, formatted to four decimal places (e.g. `0.9000\n`).
|
|
131
|
+
3. **`traces.json`**: A list of structured LLM calls made via `ev.llm_judge`, detailing inputs, responses, latencies, and metadata.
|
|
132
|
+
4. **`metadata.json`**: (Optional) Written only when `metadata` was passed to `Evaluator(...)`; contains that evaluator-level metadata.
|
|
133
|
+
5. **`artifacts/`**: Subfolder containing copy-back files preserved during the evaluation run.
|
graded-1.0.0/README.md
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# graded 🍳
|
|
2
|
+
|
|
3
|
+
`graded` is a defensive verifier and grading framework designed for agent evaluations (particularly for Harbor agent evaluations). It allows you to declare structured grading criteria, leverage LLM judges with automatic tracing, and safely manage evaluation artifacts.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
Install `graded` directly from PyPI (or your internal registry):
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install graded
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Or with `uv`:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
uv pip install graded
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
Create an evaluation script (e.g. `verify.py`) to grade a task workspace:
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from graded import Evaluator
|
|
30
|
+
|
|
31
|
+
# Initialize the evaluator
|
|
32
|
+
ev = Evaluator(
|
|
33
|
+
workspace="/workspace",
|
|
34
|
+
output_path="/logs/verifier/reward.json",
|
|
35
|
+
auto_save_artifacts=True
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# 1. Declare a standard criterion
|
|
39
|
+
@ev.criterion(name="has_output_file", weight=1.0)
|
|
40
|
+
def check_output(workspace: Path) -> bool:
|
|
41
|
+
return (workspace / "output.txt").is_file()
|
|
42
|
+
|
|
43
|
+
# 2. Declare a fatal criterion (short-circuits score to 0.0 if failed)
|
|
44
|
+
@ev.criterion(name="no_syntax_errors", weight=2.0, fatal=True)
|
|
45
|
+
def check_syntax(workspace: Path) -> bool:
|
|
46
|
+
# return True or False (or float 0.0 - 1.0)
|
|
47
|
+
return True
|
|
48
|
+
|
|
49
|
+
# 3. Declare a fractional scoring criterion
|
|
50
|
+
@ev.criterion(name="test_pass_rate", weight=3.0)
|
|
51
|
+
def check_tests(workspace: Path) -> float:
|
|
52
|
+
# Returns a score between 0.0 and 1.0
|
|
53
|
+
return 0.8 # e.g., 80% of tests passed
|
|
54
|
+
|
|
55
|
+
# Run the evaluation and write outputs
|
|
56
|
+
if __name__ == "__main__":
|
|
57
|
+
ev.run()
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Core Features
|
|
63
|
+
|
|
64
|
+
### 1. Criteria Declarations (`@ev.criterion`)
|
|
65
|
+
Define check functions using the `@ev.criterion` decorator.
|
|
66
|
+
- **`name`**: The unique identifier for the criterion.
|
|
67
|
+
- **`weight`**: Relative weight of the score in the final weighted average calculation.
|
|
68
|
+
- **`fatal`**: If set to `True`, any score of `0.0` or `False` immediately short-circuits the final score to `0.0`.
|
|
69
|
+
- **Return Value**: Must return a `bool`, `int`, or `float`. Anything else raises a `ValueError`.
|
|
70
|
+
|
|
71
|
+
### 2. LLM Judge with Automatic Tracing
|
|
72
|
+
`graded` integrates with `instructor` to run structured, schema-validated LLM grading prompts, automatically logging prompt, parameters, response schema, and LLM responses to `traces.json`.
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from pydantic import BaseModel, Field
|
|
76
|
+
|
|
77
|
+
class Rubric(BaseModel):
|
|
78
|
+
score: float = Field(description="Score between 0.0 and 1.0 based on correctness.")
|
|
79
|
+
reasoning: str = Field(description="Detailed reasoning for the score.")
|
|
80
|
+
|
|
81
|
+
# In your criterion:
|
|
82
|
+
result = ev.llm_judge(
|
|
83
|
+
model="google/gemini-3.5-flash",
|
|
84
|
+
response_model=Rubric,
|
|
85
|
+
system="You are a strict code correctness evaluator.",
|
|
86
|
+
prompt="Compare the student's solution in code.py with the requirements...",
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
print(f"LLM Score: {result.score}")
|
|
90
|
+
print(f"Reasoning: {result.reasoning}")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### 3. File & Artifact Management
|
|
94
|
+
Safely access files and copy evaluation artifacts to the logs directory for post-evaluation review.
|
|
95
|
+
|
|
96
|
+
- **`ev.read_file(filename)`**: Safely reads content as a string. Auto-saves a copy to artifacts.
|
|
97
|
+
- **`ev.load_json(filename)`**: Safely parses JSON file content. Auto-saves a copy to artifacts.
|
|
98
|
+
- **`ev.save_file(filename, content)`**: Save arbitrary text/data to the artifacts directory.
|
|
99
|
+
- **`ev.save_dir(dirname)`**: Copy an entire directory from the workspace to the artifacts directory.
|
|
100
|
+
- **`ev.load_trajectory(path)`**: Load and parse an agent's ATIF `trajectory.json` file into a typed `Trajectory` object.
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Outputs
|
|
105
|
+
|
|
106
|
+
When `ev.run()` completes, the following files are written to the directory containing your configured `output_path`:
|
|
107
|
+
|
|
108
|
+
1. **`reward.json`**: A flat JSON dictionary containing the final calculated `reward` and the individual scores for each criterion:
|
|
109
|
+
```json
|
|
110
|
+
{
|
|
111
|
+
"reward": 0.9,
|
|
112
|
+
"has_output_file": 1.0,
|
|
113
|
+
"no_syntax_errors": 1.0,
|
|
114
|
+
"test_pass_rate": 0.8
|
|
115
|
+
}
|
|
116
|
+
```
|
|
117
|
+
2. **`reward.txt`**: A text file containing just the final reward float value, formatted to four decimal places (e.g. `0.9000\n`).
|
|
118
|
+
3. **`traces.json`**: A list of structured LLM calls made via `ev.llm_judge`, detailing inputs, responses, latencies, and metadata.
|
|
119
|
+
4. **`metadata.json`**: (Optional) Written only when `metadata` was passed to `Evaluator(...)`; contains that evaluator-level metadata.
|
|
120
|
+
5. **`artifacts/`**: Subfolder containing copy-back files preserved during the evaluation run.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "graded"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Defensive verifier framework and helpers for Harbor evaluations"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Programming Language :: Python :: 3",
|
|
13
|
+
"Operating System :: OS Independent",
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"pydantic>=2.0",
|
|
17
|
+
"instructor>=1.0.0",
|
|
18
|
+
"jsonref>=1.1.0",
|
|
19
|
+
"google-genai>=1.47.0",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[tool.setuptools.packages.find]
|
|
23
|
+
where = ["src"]
|
graded-1.0.0/setup.cfg
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import shutil
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Any, Dict, List, Optional, Type, Union
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from graded.types import Criterion, Trajectory
|
|
10
|
+
|
|
11
|
+
# Configure the root logger as soon as this module is imported: INFO level,
# with a "timestamp [LEVEL] message" line format.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Evaluator:
    """Register grading criteria, run them, and write reward/trace outputs.

    Criteria are registered with :meth:`criterion` and executed in
    registration order by :meth:`run`. The remaining public methods are
    helpers for reading workspace files, copying artifacts and calling an
    LLM judge while recording a trace of each call.
    """

    def __init__(
        self,
        workspace: Union[str, Path] = "/workspace",
        output_path: Union[str, Path] = "/logs/verifier/reward.json",
        auto_save_artifacts: bool = True,
        artifacts_dir: Optional[Union[str, Path]] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Configure workspace, output and artifact locations.

        Args:
            workspace: Directory handed to every criterion function.
            output_path: Path of the JSON reward file. ``reward.txt``,
                ``traces.json`` and ``metadata.json`` are written next to it.
            auto_save_artifacts: Default for whether ``read_file`` and
                ``load_json`` copy what they read into the artifacts
                directory.
            artifacts_dir: Artifact destination. Defaults to an
                ``artifacts`` directory next to ``output_path``.
            metadata: Evaluator-level metadata. Merged into every LLM trace
                and written to ``metadata.json`` when non-empty.
        """
        self.workspace = Path(workspace)
        self.output_path = Path(output_path)
        self.auto_save_artifacts = auto_save_artifacts
        self.artifacts_dir = (
            Path(artifacts_dir)
            if artifacts_dir
            else self.output_path.parent / "artifacts"
        )
        self.metadata: Dict[str, Any] = metadata or {}
        self.criteria: List[Criterion] = []
        self.scores: Dict[str, float] = {}
        self.traces: List[Dict[str, Any]] = []

    def criterion(self, name: str, weight: float = 1.0, fatal: bool = False):
        """Decorator to declare a grading criterion.

        Args:
            name: Name of the criterion; must be unique per evaluator.
            weight: Relative weight for scoring.
            fatal: If True, a score of 0.0 short-circuits the entire
                evaluation to 0.0.

        Raises:
            ValueError: When the decorator is applied and ``name`` is
                already registered.
        """

        def decorator(func: Callable[[Path], Any]):
            if any(existing.name == name for existing in self.criteria):
                raise ValueError(f"Duplicate criterion name: '{name}'")
            self.criteria.append(
                Criterion(name=name, weight=weight, fatal=fatal, func=func)
            )
            # Return the function untouched so it stays directly callable.
            return func

        return decorator

    def _inside_artifacts_dir(self, dest: Path) -> bool:
        """Return True if *dest* resolves to the artifacts directory or below it."""
        root = self.artifacts_dir.resolve()
        resolved = dest.resolve()
        return resolved == root or root in resolved.parents

    def _should_save(self, save_artifact: Optional[bool]) -> bool:
        """Resolve a per-call ``save_artifact`` flag against the instance default."""
        if save_artifact is None:
            return self.auto_save_artifacts
        return save_artifact

    def _save_artifact(self, filename: str, content: str) -> None:
        """Internal helper to save content to the artifacts directory.

        Best-effort: failures are logged, never raised. Names that would
        resolve outside the artifacts directory (absolute paths, ``..``) are
        refused so an artifact copy can never overwrite an unrelated file.
        """
        try:
            dest = self.artifacts_dir / filename
            if not self._inside_artifacts_dir(dest):
                logging.error(
                    f"Refusing to save artifact outside the artifacts directory: {filename}"
                )
                return
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_text(content, encoding="utf-8")
        except Exception as e:
            logging.error(f"Failed to save artifact {filename}: {e}")

    def save_file(self, filename: str, content: str) -> None:
        """Explicitly save text content to the artifacts directory."""
        self._save_artifact(filename, content)

    def save_dir(self, dirname: str) -> None:
        """Copy an entire directory from the workspace to the artifacts directory.

        An existing copy at the destination is replaced. A missing source is
        logged as a warning; copy failures are logged as errors.

        The destination must stay inside the artifacts directory. Without
        that check an absolute ``dirname`` makes the destination equal to the
        source, and the "replace existing copy" step would delete the source.
        """
        src = self.workspace / dirname
        dest = self.artifacts_dir / dirname
        if not src.is_dir():
            logging.warning(f"Directory {dirname} not found in workspace.")
            return
        try:
            if not self._inside_artifacts_dir(dest):
                logging.error(
                    f"Refusing to save directory artifact outside the artifacts directory: {dirname}"
                )
                return
            if dest.exists():
                shutil.rmtree(dest)
            shutil.copytree(src, dest)
        except Exception as e:
            logging.error(f"Failed to save directory artifact {dirname}: {e}")

    def load_json(
        self, filename: str, save_artifact: Optional[bool] = None
    ) -> Optional[Any]:
        """Safely loads and parses JSON from the workspace.

        Args:
            filename: Path relative to the workspace.
            save_artifact: Whether to save a copy to the artifacts directory.
                Defaults to the instance-level auto_save_artifacts setting.

        Returns:
            The parsed value, or ``None`` when the file is missing or cannot
            be read or parsed (the problem is logged).
        """
        path = self.workspace / filename
        if not path.exists():
            logging.warning(f"File {filename} not found in workspace.")
            return None
        try:
            raw = path.read_text(encoding="utf-8")
            # The raw text is copied before parsing, so a malformed file is
            # still preserved as an artifact.
            if self._should_save(save_artifact):
                self._save_artifact(filename, raw)
            return json.loads(raw)
        except Exception as e:
            logging.error(f"Error parsing JSON file {filename}: {e}")
            return None

    def read_file(
        self, filename: str, save_artifact: Optional[bool] = None
    ) -> Optional[str]:
        """Safely reads file content from the workspace.

        Args:
            filename: Path relative to the workspace.
            save_artifact: Whether to save a copy to the artifacts directory.
                Defaults to the instance-level auto_save_artifacts setting.

        Returns:
            The UTF-8 decoded text, or ``None`` when the file is missing or
            unreadable (the problem is logged).
        """
        path = self.workspace / filename
        if not path.exists():
            logging.warning(f"File {filename} not found in workspace.")
            return None
        try:
            content = path.read_text(encoding="utf-8")
            if self._should_save(save_artifact):
                self._save_artifact(filename, content)
            return content
        except Exception as e:
            logging.error(f"Error reading file {filename}: {e}")
            return None

    def load_trajectory(
        self, path: str = "/logs/agent/trajectory.json"
    ) -> "Optional[Trajectory]":
        """Load and parse the ATIF trajectory written by the agent.

        Args:
            path: Path to the trajectory JSON file. It is used as given, not
                joined onto the workspace.
                Defaults to ``/logs/agent/trajectory.json``.

        Returns:
            A typed :class:`Trajectory` on success, or ``None`` if the file is
            missing (warning logged) or unparseable (error logged).
        """
        traj_path = Path(path)
        if not traj_path.exists():
            logging.warning(f"Trajectory file not found: {path}")
            return None
        try:
            return Trajectory.model_validate_json(traj_path.read_text(encoding="utf-8"))
        except Exception as e:
            logging.error(f"Failed to parse trajectory at {path}: {e}")
            return None

    def file_exists(self, filename: str) -> bool:
        """Checks if a regular file exists in the workspace."""
        return (self.workspace / filename).is_file()

    def dir_exists(self, dirname: str) -> bool:
        """Checks if a directory exists in the workspace."""
        return (self.workspace / dirname).is_dir()

    def llm_judge(
        self,
        response_model: "Type[BaseModel]",
        system: str,
        prompt: str,
        model: str,
        client: Optional[Any] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Any:
        """Call instructor LLM judge with structured responses and trace the call.

        Every call appends one entry to ``self.traces`` holding the inputs,
        the merged metadata, ``status`` (``"success"`` or ``"failed"``),
        ``latency_seconds`` and either ``response`` or ``error``.

        Args:
            response_model: Pydantic model for structured output.
            system: System prompt text.
            prompt: User prompt text.
            model: Model identifier in ``provider/model-name`` form. The full
                string selects the provider; only the part after the last
                ``/`` is sent as the model name.
            client: Optional pre-configured instructor client. When given,
                ``instructor`` itself is not imported.
            metadata: Optional per-call metadata dict. Merged over the
                evaluator-level metadata for experiment tracking.
            **kwargs: Additional arguments passed to client.create().

        Returns:
            The object returned by ``client.create``.

        Raises:
            Exception: Whatever creating the client, the call or dumping the
                response raises, re-raised after the failed trace is recorded.
        """
        # Local import: the module does not import ``time`` at top level.
        import time

        trace_data = {
            "model": model,
            "system": system,
            "prompt": prompt,
            # repr() keeps arbitrary keyword values JSON-serialisable.
            "kwargs": {k: repr(v) for k, v in kwargs.items()},
            "response_model_schema": response_model.model_json_schema(),
            # Per-call keys win over evaluator-level keys.
            "metadata": {**self.metadata, **(metadata or {})},
        }

        started = time.perf_counter()
        try:
            if client is None:
                # Imported lazily so a caller-supplied client works without
                # the provider machinery.
                import instructor

                client = instructor.from_provider(model=model)

            result = client.create(
                model=model.split("/")[-1],
                response_model=response_model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": prompt},
                ],
                **kwargs,
            )
            # mode="json" turns datetimes, paths, enums, etc. into JSON-safe
            # values so one such field cannot make the traces.json write fail.
            response = result.model_dump(mode="json")
        except Exception as e:
            self.traces.append(
                {
                    **trace_data,
                    "error": str(e),
                    "status": "failed",
                    "latency_seconds": time.perf_counter() - started,
                }
            )
            # Bare raise re-raises the active exception unchanged.
            raise

        self.traces.append(
            {
                **trace_data,
                "response": response,
                "status": "success",
                "latency_seconds": time.perf_counter() - started,
            }
        )
        return result

    def _score_criterion(self, crit: "Criterion") -> float:
        """Run a single criterion and coerce its result to a float score.

        A crash inside the criterion is caught, logged and scored 0.0.

        Raises:
            ValueError: If the criterion returns something other than
                ``bool | int | float`` (a likely forgotten ``return``).
        """
        try:
            res = crit.func(self.workspace)
        except Exception as e:
            logging.error(
                f"Failed executing criterion '{crit.name}': {e}", exc_info=True
            )
            print(
                f"CRITERION: {crit.name} (weight={crit.weight}) -> FAILED (Score: 0.0)"
            )
            return 0.0

        if not isinstance(res, (bool, int, float)):
            raise ValueError(
                f"Criterion '{crit.name}' must return bool | int | float, "
                f"got {type(res).__name__}. Did you forget a return?"
            )

        score = float(res)  # float(True) == 1.0, float(False) == 0.0
        print(f"CRITERION: {crit.name} (weight={crit.weight}) -> Score: {score}")
        return score

    def run(self):
        """Executes all criteria, aggregates weighted scores, and writes outputs.

        The reward is the weight-weighted mean of the criterion scores, or
        0.0 when the total weight is not positive. A fatal criterion scoring
        exactly 0.0 stops the run: outputs are written with reward 0.0 and
        later criteria are not executed.
        """
        total_weight = 0.0
        weighted_score = 0.0

        print("=== Start Evaluation ===")
        for crit in self.criteria:
            total_weight += crit.weight
            score = self._score_criterion(crit)
            self.scores[crit.name] = score
            weighted_score += score * crit.weight

            if crit.fatal and score == 0.0:
                print(
                    f"FATAL: Criterion '{crit.name}' failed. Short-circuiting to 0.0."
                )
                self._write_outputs(0.0)
                return

        final_reward = (weighted_score / total_weight) if total_weight > 0 else 0.0
        self._write_outputs(final_reward)

    def _write_outputs(self, final_reward: float):
        """Write all output files (reward, metadata, traces).

        Each file is written independently; a failure on one is logged and
        does not prevent the others from being attempted.
        """
        print(f"Final Computed Reward: {final_reward:.4f}")

        out_dir = self.output_path.parent
        out_dir.mkdir(parents=True, exist_ok=True)

        # Legacy reward.txt format: the reward alone, four decimal places.
        try:
            self.output_path.with_name("reward.txt").write_text(f"{final_reward:.4f}\n")
        except Exception as e:
            logging.error(f"Failed to write reward.txt: {e}")

        # Structured reward.json: a flat mapping for Harbor compatibility,
        # each criterion score a top-level key alongside 'reward'.
        output_data = {"reward": final_reward, **self.scores}
        try:
            self.output_path.write_text(json.dumps(output_data, indent=2))
        except Exception as e:
            logging.error(f"Failed to write reward.json: {e}")

        # metadata.json is only produced when metadata was provided.
        if self.metadata:
            try:
                (out_dir / "metadata.json").write_text(
                    json.dumps(self.metadata, indent=2)
                )
            except Exception as e:
                logging.error(f"Failed to write metadata.json: {e}")

        # LLM judge traces (an empty list when no judge call was made).
        try:
            (out_dir / "traces.json").write_text(json.dumps(self.traces, indent=2))
        except Exception as e:
            logging.error(f"Failed to write traces.json: {e}")

        print("=== Evaluation Finished ===")
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Shared types for graded verifiers.
|
|
2
|
+
|
|
3
|
+
Contains the grading :class:`Criterion` and the lightweight ATIF trajectory
|
|
4
|
+
types (:class:`Trajectory`, :class:`Step`, :class:`ToolCall`). The trajectory
|
|
5
|
+
types are intentionally minimal read-only mirrors of the ATIF types defined in
|
|
6
|
+
``harbor.models.trajectories``. They use ``extra="ignore"`` throughout so that
|
|
7
|
+
forward-compatible ATIF schema additions (new fields, new versions) never cause
|
|
8
|
+
parse failures in verifier scripts.
|
|
9
|
+
|
|
10
|
+
Use ``Evaluator.load_trajectory()`` to get a typed ``Trajectory`` from the
|
|
11
|
+
agent log written during a trial.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations

from pathlib import Path
from typing import Any, Callable, Optional

from pydantic import BaseModel, Field
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Criterion(BaseModel):
    """One grading check, as recorded by the ``Evaluator.criterion`` decorator."""

    # Pydantic model settings; declared first, field order is unaffected.
    model_config = {"arbitrary_types_allowed": True}

    # Identifier of the criterion.
    name: str
    # Relative weight of this criterion's score.
    weight: float = 1.0
    # Marks the criterion as fatal.
    fatal: bool = False
    # The check itself: a callable taking a Path, any return type.
    func: Callable[[Path], Any]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ToolCall(BaseModel):
    """One tool invocation recorded inside a trajectory step."""

    # Unknown keys in the input are dropped rather than rejected.
    model_config = {"extra": "ignore"}

    tool_call_id: str
    function_name: str
    # Argument mapping of the call; empty when the input has none.
    arguments: dict[str, Any] = Field(default_factory=dict)

    def arg(self, key: str, default: Any = None) -> Any:
        """Return the argument stored under *key*, or *default* if absent."""
        try:
            return self.arguments[key]
        except KeyError:
            return default
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Step(BaseModel):
    """One turn in the agent trajectory."""

    step_id: int
    source: str  # "user" | "agent" | "system"
    message: Any = ""  # str or list[ContentPart] — we accept either
    # Spelled with Optional[...] rather than ``X | None``: pydantic evaluates
    # field annotations when the class is created, and the PEP 604 form
    # cannot be evaluated on Python 3.9, which this package declares support
    # for.
    tool_calls: Optional[list[ToolCall]] = None

    # Unknown keys in the input are ignored instead of failing validation.
    model_config = {"extra": "ignore"}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class Trajectory(BaseModel):
    """Parsed ATIF trajectory. Exposes a small query API for verifier use."""

    schema_version: str = ""
    # Optional[...] instead of ``str | None``: pydantic evaluates field
    # annotations when the class is created, and the PEP 604 form cannot be
    # evaluated on Python 3.9, which this package declares support for.
    session_id: Optional[str] = None
    steps: list[Step] = Field(default_factory=list)

    # Unknown keys in the input are ignored instead of failing validation.
    model_config = {"extra": "ignore"}

    # ------------------------------------------------------------------
    # Query primitives
    # ------------------------------------------------------------------

    def all_tool_calls(self) -> list[ToolCall]:
        """Flat list of every tool call across all steps, in step order.

        Steps are not filtered by ``source``; steps without tool calls
        contribute nothing.
        """
        return [
            tc
            for step in self.steps
            if step.tool_calls
            for tc in step.tool_calls
        ]

    def tool_calls_for(self, function_name: str) -> list[ToolCall]:
        """All tool calls whose ``function_name`` matches exactly.

        Equivalent to ``find_all(function_name)``.
        """
        return self.find_all(function_name)

    def exists(
        self,
        function_name: str,
        predicate: Optional[Callable[[ToolCall], bool]] = None,
    ) -> bool:
        """Return ``True`` if any tool call matches ``function_name`` and,
        optionally, satisfies ``predicate``.

        Examples::

            trajectory.exists("write_file")
            trajectory.exists(
                "write_file",
                lambda tc: PurePosixPath(tc.arg("path", "")).name == "blog_post.md",
            )
            trajectory.exists("terminal", lambda tc: "pytest" in tc.arg("command", ""))
        """
        return self.find(function_name, predicate) is not None

    def find(
        self,
        function_name: str,
        predicate: Optional[Callable[[ToolCall], bool]] = None,
    ) -> Optional[ToolCall]:
        """Return the first :class:`ToolCall` matching ``function_name`` (and
        ``predicate`` if given), or ``None`` if no match is found.

        The predicate is only called for tool calls whose name already
        matches.

        Example::

            tc = trajectory.find("write_file")
            if tc:
                print(tc.arg("path"))
        """
        for tc in self.all_tool_calls():
            if tc.function_name != function_name:
                continue
            if predicate is None or predicate(tc):
                return tc
        return None

    def find_all(
        self,
        function_name: str,
        predicate: Optional[Callable[[ToolCall], bool]] = None,
    ) -> list[ToolCall]:
        """Return all :class:`ToolCall` objects matching ``function_name`` (and
        ``predicate`` if given), in trajectory order.

        Example::

            calls = trajectory.find_all("write_file")
            paths = [tc.arg("path") for tc in calls]
        """
        return [
            tc
            for tc in self.all_tool_calls()
            if tc.function_name == function_name
            and (predicate is None or predicate(tc))
        ]
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graded
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Defensive verifier framework and helpers for Harbor evaluations
|
|
5
|
+
Classifier: Programming Language :: Python :: 3
|
|
6
|
+
Classifier: Operating System :: OS Independent
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: instructor>=1.0.0
|
|
11
|
+
Requires-Dist: jsonref>=1.1.0
|
|
12
|
+
Requires-Dist: google-genai>=1.47.0
|
|
13
|
+
|
|
14
|
+
# graded 🍳
|
|
15
|
+
|
|
16
|
+
`graded` is a defensive verifier and grading framework designed for agent evaluations (particularly for Harbor agent evaluations). It allows you to declare structured grading criteria, leverage LLM judges with automatic tracing, and safely manage evaluation artifacts.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
Install `graded` directly from PyPI (or your internal registry):
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install graded
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Or with `uv`:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv pip install graded
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Quick Start
|
|
37
|
+
|
|
38
|
+
Create an evaluation script (e.g. `verify.py`) to grade a task workspace:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from graded import Evaluator
|
|
43
|
+
|
|
44
|
+
# Initialize the evaluator
|
|
45
|
+
ev = Evaluator(
|
|
46
|
+
workspace="/workspace",
|
|
47
|
+
output_path="/logs/verifier/reward.json",
|
|
48
|
+
auto_save_artifacts=True
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# 1. Declare a standard criterion
|
|
52
|
+
@ev.criterion(name="has_output_file", weight=1.0)
|
|
53
|
+
def check_output(workspace: Path) -> bool:
|
|
54
|
+
return (workspace / "output.txt").is_file()
|
|
55
|
+
|
|
56
|
+
# 2. Declare a fatal criterion (short-circuits score to 0.0 if failed)
|
|
57
|
+
@ev.criterion(name="no_syntax_errors", weight=2.0, fatal=True)
|
|
58
|
+
def check_syntax(workspace: Path) -> bool:
|
|
59
|
+
# return True or False (or float 0.0 - 1.0)
|
|
60
|
+
return True
|
|
61
|
+
|
|
62
|
+
# 3. Declare a fractional scoring criterion
|
|
63
|
+
@ev.criterion(name="test_pass_rate", weight=3.0)
|
|
64
|
+
def check_tests(workspace: Path) -> float:
|
|
65
|
+
# Returns a score between 0.0 and 1.0
|
|
66
|
+
return 0.8 # e.g., 80% of tests passed
|
|
67
|
+
|
|
68
|
+
# Run the evaluation and write outputs
|
|
69
|
+
if __name__ == "__main__":
|
|
70
|
+
ev.run()
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Core Features
|
|
76
|
+
|
|
77
|
+
### 1. Criteria Declarations (`@ev.criterion`)
|
|
78
|
+
Define check functions using the `@ev.criterion` decorator.
|
|
79
|
+
- **`name`**: The unique identifier for the criterion.
|
|
80
|
+
- **`weight`**: Relative weight of the score in the final weighted average calculation.
|
|
81
|
+
- **`fatal`**: If set to `True`, a score of `0.0` (or `False`) from this criterion immediately short-circuits the final reward to `0.0`, and the remaining criteria are not run.
|
|
82
|
+
- **Return Value**: Must return a `bool`, `int`, or `float`. Anything else raises a `ValueError`.
|
|
83
|
+
|
|
84
|
+
### 2. LLM Judge with Automatic Tracing
|
|
85
|
+
`graded` integrates with `instructor` to run structured, schema-validated LLM grading prompts, automatically logging the prompt, parameters, response schema, and LLM response (or error) of each call to `traces.json`.
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from pydantic import BaseModel, Field
|
|
89
|
+
|
|
90
|
+
class Rubric(BaseModel):
|
|
91
|
+
score: float = Field(description="Score between 0.0 and 1.0 based on correctness.")
|
|
92
|
+
reasoning: str = Field(description="Detailed reasoning for the score.")
|
|
93
|
+
|
|
94
|
+
# In your criterion:
|
|
95
|
+
result = ev.llm_judge(
|
|
96
|
+
model="google/gemini-3.5-flash",
|
|
97
|
+
response_model=Rubric,
|
|
98
|
+
system="You are a strict code correctness evaluator.",
|
|
99
|
+
prompt="Compare the student's solution in code.py with the requirements...",
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
print(f"LLM Score: {result.score}")
|
|
103
|
+
print(f"Reasoning: {result.reasoning}")
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 3. File & Artifact Management
|
|
107
|
+
Safely access files and copy evaluation artifacts to the logs directory for post-evaluation review.
|
|
108
|
+
|
|
109
|
+
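- **`ev.read_file(filename)`**: Safely reads a workspace file as a string, returning `None` if the file is missing. Saves a copy to artifacts when `auto_save_artifacts` is enabled (override per call with `save_artifact=True` or `save_artifact=False`).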
|
|
110
|
+
- **`ev.load_json(filename)`**: Safely parses a workspace JSON file, returning `None` if the file is missing or is not valid JSON. Saves a copy to artifacts when `auto_save_artifacts` is enabled.
|
|
111
|
+
- **`ev.save_file(filename, content)`**: Save arbitrary text/data to the artifacts directory.
|
|
112
|
+
- **`ev.save_dir(dirname)`**: Copy an entire directory from the workspace to the artifacts directory.
|
|
113
|
+
- **`ev.load_trajectory(path)`**: Load and parse an agent's ATIF `trajectory.json` file into a typed `Trajectory` object.
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Outputs
|
|
118
|
+
|
|
119
|
+
When `ev.run()` completes, the following files are written to the directory containing your configured `output_path`:
|
|
120
|
+
|
|
121
|
+
1. **`reward.json`**: A flat JSON dictionary containing the final calculated `reward` and the individual scores for each criterion:
|
|
122
|
+
```json
|
|
123
|
+
{
|
|
124
|
+
"reward": 0.9,
|
|
125
|
+
"has_output_file": 1.0,
|
|
126
|
+
"no_syntax_errors": 1.0,
|
|
127
|
+
"test_pass_rate": 0.8
|
|
128
|
+
}
|
|
129
|
+
```
|
|
130
|
+
2. **`reward.txt`**: A text file containing just the final reward float value (e.g. `0.9000\n`).
|
|
131
|
+
3. **`traces.json`**: A list of structured LLM calls made via `ev.llm_judge`, recording each call's inputs, response schema, status, response (or error), and metadata.
|
|
132
|
+
4. **`metadata.json`**: (Optional) Contains evaluator-level and run-level metadata.
|
|
133
|
+
5. **`artifacts/`**: Subfolder containing copy-back files preserved during the evaluation run.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/graded/__init__.py
|
|
4
|
+
src/graded/evaluator.py
|
|
5
|
+
src/graded/types.py
|
|
6
|
+
src/graded.egg-info/PKG-INFO
|
|
7
|
+
src/graded.egg-info/SOURCES.txt
|
|
8
|
+
src/graded.egg-info/dependency_links.txt
|
|
9
|
+
src/graded.egg-info/requires.txt
|
|
10
|
+
src/graded.egg-info/top_level.txt
|
|
11
|
+
tests/test_evaluator_artifacts.py
|
|
12
|
+
tests/test_evaluator_io.py
|
|
13
|
+
tests/test_evaluator_llm.py
|
|
14
|
+
tests/test_evaluator_scoring.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
graded
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from graded import Evaluator
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_save_file(workspace_setup):
    """``save_file`` writes the given text under ``artifacts/`` beside the reward file."""
    evaluator = workspace_setup["evaluator"]
    artifacts_root = workspace_setup["output_path"].parent / "artifacts"

    evaluator.save_file("captured.md", "# Hello World")

    saved = artifacts_root / "captured.md"
    assert saved.exists()
    assert saved.read_text() == "# Hello World"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_save_file_nested(workspace_setup):
    """``save_file`` accepts a nested relative path and writes to the matching spot under ``artifacts/``."""
    evaluator = workspace_setup["evaluator"]
    artifacts_root = workspace_setup["output_path"].parent / "artifacts"

    evaluator.save_file("sub/dir/deep.txt", "nested content")

    saved = artifacts_root.joinpath("sub", "dir", "deep.txt")
    assert saved.exists()
    assert saved.read_text() == "nested content"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_save_dir(workspace_setup):
    """``save_dir`` copies a workspace directory and its files into ``artifacts/``."""
    workspace = workspace_setup["workspace"]
    evaluator = workspace_setup["evaluator"]
    artifacts_root = workspace_setup["output_path"].parent / "artifacts"

    # Populate a directory inside the workspace for the evaluator to copy.
    source_dir = workspace / "mydir"
    source_dir.mkdir()
    expected = {"a.txt": "aaa", "b.txt": "bbb"}
    for filename, text in expected.items():
        (source_dir / filename).write_text(text)

    evaluator.save_dir("mydir")

    copied_dir = artifacts_root / "mydir"
    assert copied_dir.is_dir()
    for filename, text in expected.items():
        assert (copied_dir / filename).read_text() == text
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_save_dir_missing(workspace_setup):
    """``save_dir`` on a directory absent from the workspace neither raises nor creates an artifact."""
    evaluator = workspace_setup["evaluator"]
    missing_copy = workspace_setup["output_path"].parent / "artifacts" / "nonexistent"

    # Expected to return normally; per the original author it only logs a warning.
    evaluator.save_dir("nonexistent")

    assert not missing_copy.exists()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_auto_capture_read_file(tmp_path):
    """With ``auto_save_artifacts=True``, ``read_file`` returns the text and mirrors the file into ``artifacts/``."""
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    reward_file = tmp_path / "logs" / "reward.json"
    evaluator = Evaluator(
        workspace=workspace,
        output_path=reward_file,
        auto_save_artifacts=True,
    )
    (workspace / "doc.md").write_text("auto captured content")

    content = evaluator.read_file("doc.md")
    assert content == "auto captured content"

    mirrored = reward_file.parent / "artifacts" / "doc.md"
    assert mirrored.exists()
    assert mirrored.read_text() == "auto captured content"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_auto_capture_load_json(tmp_path):
    """With ``auto_save_artifacts=True``, ``load_json`` returns the parsed data and mirrors the file into ``artifacts/``."""
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    reward_file = tmp_path / "logs" / "reward.json"
    evaluator = Evaluator(
        workspace=workspace,
        output_path=reward_file,
        auto_save_artifacts=True,
    )
    (workspace / "data.json").write_text(json.dumps({"key": "value"}))

    parsed = evaluator.load_json("data.json")
    assert parsed == {"key": "value"}

    mirrored = reward_file.parent / "artifacts" / "data.json"
    assert mirrored.exists()
    assert json.loads(mirrored.read_text()) == {"key": "value"}
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def test_auto_capture_disabled_globally(tmp_path):
    """With ``auto_save_artifacts=False``, ``read_file`` leaves no copy under ``artifacts/``."""
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    reward_file = tmp_path / "logs" / "reward.json"
    evaluator = Evaluator(
        workspace=workspace,
        output_path=reward_file,
        auto_save_artifacts=False,
    )
    (workspace / "doc.md").write_text("should not be captured")

    evaluator.read_file("doc.md")

    unexpected_copy = reward_file.parent / "artifacts" / "doc.md"
    assert not unexpected_copy.exists()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_per_call_override_save(tmp_path):
    """``save_artifact=True`` on a single ``read_file`` call captures the file even when auto-save is off."""
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    reward_file = tmp_path / "logs" / "reward.json"

    # Global auto-save is disabled; only the per-call flag requests a copy.
    evaluator = Evaluator(
        workspace=workspace,
        output_path=reward_file,
        auto_save_artifacts=False,
    )
    (workspace / "important.md").write_text("save me")

    evaluator.read_file("important.md", save_artifact=True)

    captured = reward_file.parent / "artifacts" / "important.md"
    assert captured.exists()
    assert captured.read_text() == "save me"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_per_call_override_skip(tmp_path):
    """``save_artifact=False`` on a single ``read_file`` call suppresses the copy even when auto-save is on."""
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    reward_file = tmp_path / "logs" / "reward.json"

    # Global auto-save is enabled; the per-call flag opts this read out.
    evaluator = Evaluator(
        workspace=workspace,
        output_path=reward_file,
        auto_save_artifacts=True,
    )
    (workspace / "config.yaml").write_text("skip me")

    evaluator.read_file("config.yaml", save_artifact=False)

    unexpected_copy = reward_file.parent / "artifacts" / "config.yaml"
    assert not unexpected_copy.exists()
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
def test_load_json_success(workspace_setup):
    """``load_json`` returns the parsed content of a valid JSON file in the workspace."""
    workspace = workspace_setup["workspace"]
    evaluator = workspace_setup["evaluator"]
    workspace.joinpath("valid.json").write_text(json.dumps({"a": 1}))

    loaded = evaluator.load_json("valid.json")

    assert loaded == {"a": 1}
|
|
11
|
+
|
|
12
|
+
def test_load_json_missing(workspace_setup):
    """``load_json`` yields ``None`` for a file that does not exist."""
    evaluator = workspace_setup["evaluator"]
    loaded = evaluator.load_json("missing.json")
    assert loaded is None
|
|
15
|
+
|
|
16
|
+
def test_load_json_invalid(workspace_setup):
    """``load_json`` yields ``None`` when the file content is not valid JSON."""
    workspace = workspace_setup["workspace"]
    evaluator = workspace_setup["evaluator"]
    workspace.joinpath("invalid.json").write_text("{invalid")

    loaded = evaluator.load_json("invalid.json")

    assert loaded is None
|
|
23
|
+
|
|
24
|
+
def test_read_file_success(workspace_setup):
    """``read_file`` returns the text of an existing workspace file."""
    workspace = workspace_setup["workspace"]
    evaluator = workspace_setup["evaluator"]
    workspace.joinpath("doc.txt").write_text("hello world")

    content = evaluator.read_file("doc.txt")

    assert content == "hello world"
|
|
31
|
+
|
|
32
|
+
def test_read_file_missing(workspace_setup):
    """``read_file`` yields ``None`` for a file that does not exist."""
    evaluator = workspace_setup["evaluator"]
    content = evaluator.read_file("missing.txt")
    assert content is None
|
|
35
|
+
|
|
36
|
+
def test_file_exists(workspace_setup):
    """``file_exists`` is true only for regular files present in the workspace."""
    workspace = workspace_setup["workspace"]
    evaluator = workspace_setup["evaluator"]

    # Absent before creation, present afterwards.
    assert not evaluator.file_exists("foo.txt")
    workspace.joinpath("foo.txt").write_text("hello")
    assert evaluator.file_exists("foo.txt")

    # A directory must not be reported as a file.
    workspace.joinpath("bar").mkdir()
    assert not evaluator.file_exists("bar")
|
|
47
|
+
|
|
48
|
+
def test_dir_exists(workspace_setup):
    """``dir_exists`` is true only for directories present in the workspace."""
    workspace = workspace_setup["workspace"]
    evaluator = workspace_setup["evaluator"]

    # Absent before creation, present afterwards.
    assert not evaluator.dir_exists("bar")
    workspace.joinpath("bar").mkdir()
    assert evaluator.dir_exists("bar")

    # A regular file must not be reported as a directory.
    workspace.joinpath("foo.txt").write_text("hello")
    assert not evaluator.dir_exists("foo.txt")
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pytest
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
from conftest import DummyRubric
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_llm_judge_real_success(workspace_setup):
    """Call the real judge model and check the parsed rubric and the recorded trace.

    Skipped unless ``GEMINI_API_KEY`` is set to a non-empty value.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        pytest.skip("GEMINI_API_KEY environment variable not set")

    evaluator = workspace_setup["evaluator"]
    model_name = "google/gemini-3.5-flash"
    system_text = "You are a strict helper grader."
    prompt_text = "Please grade the politeness of this string: 'Hello, could you please help me with my task?'"

    rubric = evaluator.llm_judge(
        model=model_name,
        response_model=DummyRubric,
        system=system_text,
        prompt=prompt_text,
    )

    assert isinstance(rubric, DummyRubric)
    assert 0.0 <= rubric.score <= 1.0
    assert len(rubric.reasoning) > 0

    # Exactly one trace must be stored, and it must mirror the request and the response.
    assert len(evaluator.traces) == 1
    expected_trace = {
        "model": model_name,
        "system": system_text,
        "prompt": prompt_text,
        "kwargs": {},
        "response_model_schema": DummyRubric.model_json_schema(),
        "status": "success",
        "response": {"score": rubric.score, "reasoning": rubric.reasoning},
        "metadata": {},
    }
    assert evaluator.traces[0] == expected_trace
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_llm_judge_real_failure(workspace_setup):
    """A failing judge call propagates its exception and is still recorded as a failed trace.

    Skipped unless ``GEMINI_API_KEY`` is set to a non-empty value.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        pytest.skip("GEMINI_API_KEY environment variable not set")

    evaluator = workspace_setup["evaluator"]
    bad_model = "google/invalid-model-name-does-not-exist"
    system_text = "be strict"
    prompt_text = "evaluate guide"

    # A nonexistent model name forces a real API / SDK validation error.
    with pytest.raises(Exception) as exc_info:
        evaluator.llm_judge(
            model=bad_model,
            response_model=DummyRubric,
            system=system_text,
            prompt=prompt_text,
        )

    # The failure must be traced, carrying the stringified exception.
    assert len(evaluator.traces) == 1
    expected_trace = {
        "model": bad_model,
        "system": system_text,
        "prompt": prompt_text,
        "kwargs": {},
        "response_model_schema": DummyRubric.model_json_schema(),
        "status": "failed",
        "error": str(exc_info.value),
        "metadata": {},
    }
    assert evaluator.traces[0] == expected_trace
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
def test_criterion_duplicate_name_rejected(workspace_setup):
    """Registering a second criterion under an already-used name raises ``ValueError``."""
    evaluator = workspace_setup["evaluator"]

    @evaluator.criterion("same_name")
    def first_check(ws):
        return True

    expected_message = "Duplicate criterion name: 'same_name'"
    with pytest.raises(ValueError, match=expected_message):

        @evaluator.criterion("same_name")
        def second_check(ws):
            return True
|
|
15
|
+
|
|
16
|
+
def test_criterion_registration(workspace_setup):
    """The ``criterion`` decorator records name, weight, function and a default ``fatal`` of ``False``."""
    ev = workspace_setup["evaluator"]

    @ev.criterion("check_1", weight=2.5)
    def my_check(ws):
        return True

    assert len(ev.criteria) == 1
    registered = ev.criteria[0]
    assert registered.name == "check_1"
    assert registered.weight == 2.5
    # Identity checks: the stored callable must be the decorated function itself,
    # and the default must be the ``False`` singleton (PEP 8: never ``== False``).
    assert registered.func is my_check
    assert registered.fatal is False
|
|
28
|
+
|
|
29
|
+
def test_run_weighted_scoring(workspace_setup):
    """``run`` writes a weight-normalised reward along with every criterion's own score."""
    evaluator = workspace_setup["evaluator"]
    reward_file = workspace_setup["output_path"]

    @evaluator.criterion("check_true", weight=3.0)
    def passes(ws):
        return True

    @evaluator.criterion("check_false", weight=1.0)
    def fails(ws):
        return False

    @evaluator.criterion("check_float", weight=2.0)
    def half_credit(ws):
        return 0.5

    evaluator.run()

    # Weights sum to 3.0 + 1.0 + 2.0 = 6.0.
    # Weighted total is 1.0 * 3.0 + 0.0 * 1.0 + 0.5 * 2.0 = 4.0.
    # Reward is therefore 4.0 / 6.0, i.e. 0.6667 after rounding.
    assert reward_file.exists()
    scores = json.loads(reward_file.read_text())
    scores["reward"] = round(scores["reward"], 4)
    expected = {
        "reward": 0.6667,
        "check_true": 1.0,
        "check_false": 0.0,
        "check_float": 0.5,
    }
    assert scores == expected
|
|
60
|
+
|
|
61
|
+
def test_run_handles_exceptions(workspace_setup):
    """A criterion that raises is scored ``0.0`` and the remaining scoring still completes."""
    evaluator = workspace_setup["evaluator"]
    reward_file = workspace_setup["output_path"]

    @evaluator.criterion("check_pass", weight=1.0)
    def passes(ws):
        return True

    @evaluator.criterion("check_crash", weight=1.0)
    def crashes(ws):
        raise ValueError("Simulated crash")

    evaluator.run()

    # Weights sum to 2.0; weighted total is 1.0 * 1.0 + 0.0 * 1.0 = 1.0,
    # so the reward is 1.0 / 2.0 = 0.5.
    scores = json.loads(reward_file.read_text())
    expected = {
        "reward": 0.5,
        "check_pass": 1.0,
        "check_crash": 0.0,
    }
    assert scores == expected
|
|
85
|
+
|
|
86
|
+
def test_fatal_criterion_fails(workspace_setup):
    """A failed fatal criterion forces the reward to ``0.0`` and later criteria are not reported."""
    evaluator = workspace_setup["evaluator"]
    reward_file = workspace_setup["output_path"]

    @evaluator.criterion("file_check", weight=0.10, fatal=True)
    def fatal_failure(ws):
        return False

    @evaluator.criterion("content_check", weight=0.90)
    def never_reached(ws):
        return 1.0

    evaluator.run()

    scores = json.loads(reward_file.read_text())
    # The fatal failure short-circuits: reward is 0.0 and content_check has no entry.
    assert scores["reward"] == 0.0
    assert "content_check" not in scores
|
|
104
|
+
|
|
105
|
+
def test_criterion_invalid_return_type_raises(workspace_setup):
    """A criterion returning something other than bool/int/float makes ``run`` raise ``ValueError``."""
    ev = workspace_setup["evaluator"]

    @ev.criterion("forgot_return")
    def forgot_return(ws):
        pass  # returns None

    # ``match`` is applied as a regular expression (re.search). The pipes are
    # escaped so the literal text "bool | int | float" is required; unescaped,
    # ``|`` is alternation and the pattern would accept nearly any message.
    with pytest.raises(ValueError, match=r"must return bool \| int \| float"):
        ev.run()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_criterion_crash_still_scores_zero(workspace_setup):
    """An exception inside a criterion is absorbed by ``run`` and scored as ``0.0``."""
    evaluator = workspace_setup["evaluator"]
    reward_file = workspace_setup["output_path"]

    @evaluator.criterion("ok", weight=1.0)
    def succeeds(ws):
        return True

    @evaluator.criterion("boom", weight=1.0)
    def explodes(ws):
        raise RuntimeError("kaboom")

    # Unlike an invalid return type, a genuine crash must not propagate out of run().
    evaluator.run()

    scores = json.loads(reward_file.read_text())
    expected = {"reward": 0.5, "ok": 1.0, "boom": 0.0}
    assert scores == expected
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_fatal_criterion_passes(workspace_setup):
    """A passing fatal criterion is scored like any other and normal weighting applies."""
    evaluator = workspace_setup["evaluator"]
    reward_file = workspace_setup["output_path"]

    @evaluator.criterion("file_check", weight=1.0, fatal=True)
    def fatal_success(ws):
        return True

    @evaluator.criterion("content_check", weight=1.0)
    def partial_credit(ws):
        return 0.8

    evaluator.run()

    scores = json.loads(reward_file.read_text())
    # No short-circuit: (1.0 * 1.0 + 0.8 * 1.0) / 2.0 = 0.9
    assert scores["reward"] == 0.9
    expected = {"reward": 0.9, "file_check": 1.0, "content_check": 0.8}
    assert scores == expected
|