graded 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graded/__init__.py +4 -0
- graded/evaluator.py +329 -0
- graded/types.py +145 -0
- graded-1.0.0.dist-info/METADATA +133 -0
- graded-1.0.0.dist-info/RECORD +7 -0
- graded-1.0.0.dist-info/WHEEL +5 -0
- graded-1.0.0.dist-info/top_level.txt +1 -0
graded/__init__.py
ADDED
graded/evaluator.py
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import shutil
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Any, Dict, List, Optional, Type, Union
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from graded.types import Criterion, Trajectory
|
|
10
|
+
|
|
11
|
+
# Configure the root logger at import time: INFO level, timestamped records.
# NOTE(review): this is a module-level side effect of importing the evaluator.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Evaluator:
|
|
17
|
+
def __init__(
    self,
    workspace: Union[str, Path] = "/workspace",
    output_path: Union[str, Path] = "/logs/verifier/reward.json",
    auto_save_artifacts: bool = True,
    artifacts_dir: Optional[Union[str, Path]] = None,
    metadata: Optional[Dict[str, Any]] = None,
):
    """Set up paths and empty result containers for one evaluation.

    Args:
        workspace: Directory holding the files to grade.
        output_path: Location of the structured reward file; the other
            output files are written next to it.
        auto_save_artifacts: Default for whether ``load_json`` and
            ``read_file`` copy what they read into the artifacts directory.
        artifacts_dir: Where artifacts are stored. When falsy, an
            ``artifacts`` folder beside ``output_path`` is used.
        metadata: Evaluator-level metadata; an empty dict when omitted.
    """
    self.workspace = Path(workspace)
    self.output_path = Path(output_path)
    self.auto_save_artifacts = auto_save_artifacts

    if artifacts_dir:
        self.artifacts_dir = Path(artifacts_dir)
    else:
        self.artifacts_dir = self.output_path.parent / "artifacts"

    self.metadata: Dict[str, Any] = metadata if metadata else {}

    # Filled in by ``criterion``, ``run`` and ``llm_judge`` respectively.
    self.criteria: List[Criterion] = []
    self.scores: Dict[str, float] = {}
    self.traces: List[Dict[str, Any]] = []
|
|
37
|
+
|
|
38
|
+
def criterion(self, name: str, weight: float = 1.0, fatal: bool = False):
    """Build a decorator that registers a function as a grading criterion.

    Args:
        name: Name of the criterion; must not already be registered.
        weight: Relative weight for scoring.
        fatal: If True, a score of 0.0 short-circuits the entire evaluation to 0.0.

    Returns:
        A decorator that records the function in ``self.criteria`` and hands
        the function back unchanged.

    Raises:
        ValueError: When the decorator is applied and ``name`` is already
            taken by a registered criterion.
    """

    def decorator(func: Callable[[Path], Any]):
        # The duplicate check happens at decoration time, not at call time.
        for existing in self.criteria:
            if existing.name == name:
                raise ValueError(f"Duplicate criterion name: '{name}'")

        entry = Criterion(name=name, weight=weight, fatal=fatal, func=func)
        self.criteria.append(entry)
        return func

    return decorator
|
|
56
|
+
|
|
57
|
+
def _save_artifact(self, filename: str, content: str) -> None:
    """Write ``content`` to ``filename`` inside the artifacts directory.

    Parent folders are created as needed. The call is best-effort: any
    failure is logged and swallowed so that saving an artifact never aborts
    an evaluation.

    Args:
        filename: Path relative to the artifacts directory.
        content: Text to write (UTF-8).
    """
    try:
        root = self.artifacts_dir.resolve()
        dest = (root / filename).resolve()
        # An absolute filename or one containing ".." would otherwise land
        # outside the artifacts directory (joining an absolute path discards
        # the left-hand side), possibly overwriting the file just read.
        if root not in dest.parents:
            logging.error(
                f"Refusing to save artifact {filename}: "
                f"path is outside the artifacts directory."
            )
            return
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(content, encoding="utf-8")
    except Exception as e:
        logging.error(f"Failed to save artifact {filename}: {e}")
|
|
65
|
+
|
|
66
|
+
def save_file(self, filename: str, content: str) -> None:
    """Store ``content`` under ``filename`` in the artifacts directory.

    Public entry point that delegates to ``_save_artifact``.
    """
    return self._save_artifact(filename, content)
|
|
69
|
+
|
|
70
|
+
def save_dir(self, dirname: str) -> None:
    """Mirror a workspace directory into the artifacts directory.

    Any existing copy at the destination is removed first, so the result
    reflects the workspace directory as it is now. A missing source
    directory is logged as a warning; copy failures are logged as errors.

    Args:
        dirname: Directory path relative to the workspace.
    """
    source_dir = self.workspace / dirname
    target_dir = self.artifacts_dir / dirname

    if source_dir.is_dir():
        try:
            # copytree requires that the destination does not exist yet.
            if target_dir.exists():
                shutil.rmtree(target_dir)
            shutil.copytree(source_dir, target_dir)
        except Exception as e:
            logging.error(f"Failed to save directory artifact {dirname}: {e}")
    else:
        logging.warning(f"Directory {dirname} not found in workspace.")
|
|
83
|
+
|
|
84
|
+
def load_json(
    self, filename: str, save_artifact: Optional[bool] = None
) -> Optional[Any]:
    """Read a workspace file and parse it as JSON without raising.

    Args:
        filename: Path relative to the workspace.
        save_artifact: Whether to save a copy to the artifacts directory.
            ``None`` means "use the instance-level auto_save_artifacts setting".

    Returns:
        The parsed JSON value, or ``None`` when the file is missing, cannot
        be read, or is not valid JSON (the reason is logged).
    """
    source = self.workspace / filename
    if not source.exists():
        logging.warning(f"File {filename} not found in workspace.")
        return None

    try:
        text = source.read_text(encoding="utf-8")
        if save_artifact is None:
            keep_copy = self.auto_save_artifacts
        else:
            keep_copy = save_artifact
        # The raw text is archived before parsing, so a malformed file is
        # still preserved as an artifact.
        if keep_copy:
            self._save_artifact(filename, text)
        parsed = json.loads(text)
    except Exception as e:
        logging.error(f"Error parsing JSON file {filename}: {e}")
        return None
    return parsed
|
|
109
|
+
|
|
110
|
+
def read_file(
    self, filename: str, save_artifact: Optional[bool] = None
) -> Optional[str]:
    """Read a workspace file as UTF-8 text without raising.

    Args:
        filename: Path relative to the workspace.
        save_artifact: Whether to save a copy to the artifacts directory.
            ``None`` means "use the instance-level auto_save_artifacts setting".

    Returns:
        The file content, or ``None`` when the file is missing or cannot be
        read (the reason is logged).
    """
    source = self.workspace / filename
    if not source.exists():
        logging.warning(f"File {filename} not found in workspace.")
        return None

    try:
        text = source.read_text(encoding="utf-8")
        if save_artifact is None:
            keep_copy = self.auto_save_artifacts
        else:
            keep_copy = save_artifact
        if keep_copy:
            self._save_artifact(filename, text)
    except Exception as e:
        logging.error(f"Error reading file {filename}: {e}")
        return None
    return text
|
|
135
|
+
|
|
136
|
+
def load_trajectory(
    self, path: str = "/logs/agent/trajectory.json"
) -> Optional[Trajectory]:
    """Parse the ATIF trajectory file written by the agent.

    Args:
        path: Path to the trajectory JSON file.
            Defaults to ``/logs/agent/trajectory.json``.

    Returns:
        A typed :class:`Trajectory` on success. ``None`` if the file is
        missing (logged as a warning) or cannot be read or validated
        (logged as an error).
    """
    source = Path(path)
    if source.exists():
        try:
            raw = source.read_text(encoding="utf-8")
            return Trajectory.model_validate_json(raw)
        except Exception as e:
            logging.error(f"Failed to parse trajectory at {path}: {e}")
            return None

    logging.warning(f"Trajectory file not found: {path}")
    return None
|
|
158
|
+
|
|
159
|
+
def file_exists(self, filename: str) -> bool:
    """Return True when ``filename`` (relative to the workspace) is a file."""
    return (self.workspace / filename).is_file()
|
|
163
|
+
|
|
164
|
+
def dir_exists(self, dirname: str) -> bool:
    """Return True when ``dirname`` (relative to the workspace) is a directory."""
    return (self.workspace / dirname).is_dir()
|
|
168
|
+
|
|
169
|
+
def llm_judge(
    self,
    response_model: Type[BaseModel],
    system: str,
    prompt: str,
    model: str,
    client: Optional[Any] = None,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> Any:
    """Call an instructor LLM judge with structured output and trace the call.

    Every call appends one entry to ``self.traces`` (status ``"success"`` or
    ``"failed"``), including the elapsed time in seconds under ``latency_s``.

    Args:
        response_model: Pydantic model for structured output.
        system: System prompt text.
        prompt: User prompt text.
        model: Model identifier in ``provider/model`` form
            (e.g. "google/gemini-3.1-flash-lite").
        client: Optional pre-configured instructor client. When omitted, one
            is created with ``instructor.from_provider(model=model)``.
        metadata: Optional per-call metadata dict. Merged over the
            evaluator-level metadata (per-call keys win) for experiment tracking.
        **kwargs: Additional arguments passed to client.create().

    Returns:
        The ``response_model`` instance returned by the client.

    Raises:
        Exception: Whatever the client (or trace serialization) raises; the
            failure is recorded in ``self.traces`` before re-raising.
    """
    import time

    import instructor

    # Merge metadata: evaluator-level -> per-call
    merged_metadata = {**self.metadata, **(metadata or {})}

    trace_data = {
        "model": model,
        "system": system,
        "prompt": prompt,
        # repr() keeps the trace JSON-serializable whatever the caller passed.
        "kwargs": {k: repr(v) for k, v in kwargs.items()},
        "response_model_schema": response_model.model_json_schema(),
        "metadata": merged_metadata,
    }

    started = time.perf_counter()
    try:
        if client is None:
            client = instructor.from_provider(model=model)

        # Strip only the leading provider segment: model ids may themselves
        # contain "/" (e.g. "openrouter/google/<model>"), and from_provider
        # splits on the first "/" only.
        model_name = model.split("/", 1)[-1]
        result = client.create(
            model=model_name,
            response_model=response_model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": prompt},
            ],
            **kwargs,
        )
        # mode="json" converts values such as datetimes to JSON-compatible
        # types so the later traces.json dump does not fail on them.
        self.traces.append(
            {
                **trace_data,
                "response": result.model_dump(mode="json"),
                "latency_s": time.perf_counter() - started,
                "status": "success",
            }
        )
        return result
    except Exception as e:
        self.traces.append(
            {
                **trace_data,
                "error": str(e),
                "latency_s": time.perf_counter() - started,
                "status": "failed",
            }
        )
        # Bare raise re-raises the active exception with its traceback intact.
        raise
|
|
241
|
+
|
|
242
|
+
def _score_criterion(self, crit: "Criterion") -> float:
    """Run a single criterion and coerce its result to a float score.

    A crash inside the criterion is caught and scored 0.0. A non-finite
    result (NaN or infinity) is also scored 0.0, because it would otherwise
    propagate into the weighted reward and be written as invalid JSON.

    Args:
        crit: The criterion to execute; its ``func`` is called with the
            workspace path.

    Returns:
        The score as a float (``True`` -> 1.0, ``False`` -> 0.0).

    Raises:
        ValueError: If the criterion returns something that is not
            ``bool | int | float`` (a likely forgotten ``return``).
    """
    import math

    try:
        res = crit.func(self.workspace)
    except Exception as e:
        logging.error(
            f"Failed executing criterion '{crit.name}': {e}", exc_info=True
        )
        print(
            f"CRITERION: {crit.name} (weight={crit.weight}) -> FAILED (Score: 0.0)"
        )
        return 0.0

    if not isinstance(res, (bool, int, float)):
        raise ValueError(
            f"Criterion '{crit.name}' must return bool | int | float, "
            f"got {type(res).__name__}. Did you forget a return?"
        )

    score = float(res)  # float(True) == 1.0, float(False) == 0.0
    if not math.isfinite(score):
        # Treated like a crashed criterion rather than poisoning the total.
        logging.error(
            f"Criterion '{crit.name}' returned a non-finite score ({score})."
        )
        print(
            f"CRITERION: {crit.name} (weight={crit.weight}) -> FAILED (Score: 0.0)"
        )
        return 0.0

    print(f"CRITERION: {crit.name} (weight={crit.weight}) -> Score: {score}")
    return score
|
|
269
|
+
|
|
270
|
+
def run(self):
    """Score every registered criterion and write the result files.

    The final reward is the weight-normalized sum of the criterion scores
    (0.0 when the total weight is not positive). If a criterion marked
    ``fatal`` scores exactly 0.0, a reward of 0.0 is written immediately and
    the remaining criteria are not executed.
    """
    weight_sum = 0.0
    score_sum = 0.0

    print("=== Start Evaluation ===")
    for current in self.criteria:
        weight_sum += current.weight
        result = self._score_criterion(current)
        self.scores[current.name] = result
        score_sum += result * current.weight

        if not (current.fatal and result == 0.0):
            continue

        print(
            f"FATAL: Criterion '{current.name}' failed. Short-circuiting to 0.0."
        )
        self._write_outputs(0.0)
        return

    if weight_sum > 0:
        final_reward = score_sum / weight_sum
    else:
        final_reward = 0.0
    self._write_outputs(final_reward)
|
|
291
|
+
|
|
292
|
+
def _write_outputs(self, final_reward: float):
    """Write all output files (reward, metadata, traces).

    Files are written next to ``self.output_path``. Each write is
    best-effort: a failure is logged and the remaining files are still
    attempted. Creating the output directory is not guarded and may raise.

    Args:
        final_reward: The aggregated reward to report.
    """
    print(f"Final Computed Reward: {final_reward:.4f}")

    # Ensure output directories exist
    out_dir = self.output_path.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    # Write legacy reward.txt format
    reward_txt = self.output_path.with_name("reward.txt")
    try:
        reward_txt.write_text(f"{final_reward:.4f}\n", encoding="utf-8")
    except Exception as e:
        logging.error(f"Failed to write reward.txt: {e}")

    # Write structured reward.json — flat dict[str, float|int] for Harbor compatibility
    # Each criterion score is a top-level key alongside 'reward'
    output_data = {"reward": final_reward}
    for name, score in self.scores.items():
        if name == "reward":
            # A criterion with this name would silently replace the final
            # reward in the flat dict; the final reward takes precedence.
            logging.warning(
                "Criterion named 'reward' collides with the final reward key; "
                "its score is omitted from reward.json."
            )
            continue
        output_data[name] = score
    try:
        self.output_path.write_text(
            json.dumps(output_data, indent=2), encoding="utf-8"
        )
    except Exception as e:
        logging.error(f"Failed to write reward.json: {e}")

    # Write metadata.json if metadata was provided
    if self.metadata:
        metadata_path = out_dir / "metadata.json"
        try:
            # default=str: metadata is diagnostic, so stringify values JSON
            # cannot encode rather than dropping the whole file.
            metadata_path.write_text(
                json.dumps(self.metadata, indent=2, default=str),
                encoding="utf-8",
            )
        except Exception as e:
            logging.error(f"Failed to write metadata.json: {e}")

    # Write LLM judge traces
    traces_path = out_dir / "traces.json"
    try:
        # default=str for the same reason: one odd value must not lose all traces.
        traces_path.write_text(
            json.dumps(self.traces, indent=2, default=str), encoding="utf-8"
        )
    except Exception as e:
        logging.error(f"Failed to write traces.json: {e}")

    print("=== Evaluation Finished ===")
|
graded/types.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Shared types for graded verifiers.
|
|
2
|
+
|
|
3
|
+
Contains the grading :class:`Criterion` and the lightweight ATIF trajectory
|
|
4
|
+
types (:class:`Trajectory`, :class:`Step`, :class:`ToolCall`). The trajectory
|
|
5
|
+
types are intentionally minimal read-only mirrors of the ATIF types defined in
|
|
6
|
+
``harbor.models.trajectories``. They use ``extra="ignore"`` throughout so that
|
|
7
|
+
forward-compatible ATIF schema additions (new fields, new versions) never cause
|
|
8
|
+
parse failures in verifier scripts.
|
|
9
|
+
|
|
10
|
+
Use ``Evaluator.load_trajectory()`` to get a typed ``Trajectory`` from the
|
|
11
|
+
agent log written during a trial.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Callable, Optional
|
|
18
|
+
|
|
19
|
+
from pydantic import BaseModel, Field
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Criterion(BaseModel):
    """One grading check registered through ``Evaluator.criterion``."""

    # Let pydantic accept field types it has no built-in validator for.
    model_config = {"arbitrary_types_allowed": True}

    # Identifier of the criterion; used as its key in the score output.
    name: str
    # Relative weight of this criterion's score in the weighted average.
    weight: float = 1.0
    # When True, a score of 0.0 short-circuits the whole evaluation to 0.0.
    fatal: bool = False
    # The check itself; called with the workspace path.
    func: Callable[[Path], Any]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ToolCall(BaseModel):
    """One tool invocation recorded inside an agent step."""

    # Unknown keys in the input are dropped instead of failing validation.
    model_config = {"extra": "ignore"}

    tool_call_id: str
    function_name: str
    arguments: dict[str, Any] = Field(default_factory=dict)

    def arg(self, key: str, default: Any = None) -> Any:
        """Look up one argument by name, falling back to ``default``."""
        if key in self.arguments:
            return self.arguments[key]
        return default
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Step(BaseModel):
    """One turn in the agent trajectory."""

    step_id: int
    source: str  # "user" | "agent" | "system"
    message: Any = ""  # str or list[ContentPart] — we accept either
    # Spelled with Optional[...] rather than ``X | None``: pydantic evaluates
    # field annotations at runtime, and the ``|`` union syntax is not
    # evaluable on Python 3.9, which the package metadata declares supported.
    tool_calls: Optional[list[ToolCall]] = None

    # Unknown keys in the input are dropped instead of failing validation.
    model_config = {"extra": "ignore"}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class Trajectory(BaseModel):
    """Parsed ATIF trajectory. Exposes a small query API for verifier use."""

    schema_version: str = ""
    # Optional[...] instead of ``str | None``: pydantic evaluates field
    # annotations at runtime and the ``|`` syntax fails on Python 3.9.
    session_id: Optional[str] = None
    steps: list[Step] = Field(default_factory=list)

    # Unknown keys in the input are dropped instead of failing validation.
    model_config = {"extra": "ignore"}

    # ------------------------------------------------------------------
    # Query primitives
    # ------------------------------------------------------------------

    def all_tool_calls(self) -> list[ToolCall]:
        """Flat list of every tool call across all steps, in step order.

        Steps whose ``tool_calls`` is ``None`` or empty contribute nothing.
        """
        return [
            tc
            for step in self.steps
            if step.tool_calls
            for tc in step.tool_calls
        ]

    def tool_calls_for(self, function_name: str) -> list[ToolCall]:
        """All tool calls whose ``function_name`` matches exactly.

        Equivalent to ``find_all(function_name)``.
        """
        return self.find_all(function_name)

    def exists(
        self,
        function_name: str,
        predicate: Callable[[ToolCall], bool] | None = None,
    ) -> bool:
        """Return ``True`` if any tool call matches ``function_name`` and,
        optionally, satisfies ``predicate``.

        Examples::

            trajectory.exists("write_file")
            trajectory.exists(
                "write_file",
                lambda tc: PurePosixPath(tc.arg("path", "")).name == "blog_post.md",
            )
            trajectory.exists("terminal", lambda tc: "pytest" in tc.arg("command", ""))
        """
        return self.find(function_name, predicate) is not None

    def find(
        self,
        function_name: str,
        predicate: Callable[[ToolCall], bool] | None = None,
    ) -> ToolCall | None:
        """Return the first :class:`ToolCall` matching ``function_name`` (and
        ``predicate`` if given), or ``None`` if no match is found.

        Example::

            tc = trajectory.find("write_file")
            if tc:
                print(tc.arg("path"))
        """
        for tc in self.all_tool_calls():
            if tc.function_name != function_name:
                continue
            # The predicate is only consulted for calls whose name matches.
            if predicate is None or predicate(tc):
                return tc
        return None

    def find_all(
        self,
        function_name: str,
        predicate: Callable[[ToolCall], bool] | None = None,
    ) -> list[ToolCall]:
        """Return all :class:`ToolCall` objects matching ``function_name`` (and
        ``predicate`` if given).

        Example::

            calls = trajectory.find_all("write_file")
            paths = [tc.arg("path") for tc in calls]
        """
        return [
            tc
            for tc in self.all_tool_calls()
            if tc.function_name == function_name
            and (predicate is None or predicate(tc))
        ]
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graded
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Defensive verifier framework and helpers for Harbor evaluations
|
|
5
|
+
Classifier: Programming Language :: Python :: 3
|
|
6
|
+
Classifier: Operating System :: OS Independent
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: instructor>=1.0.0
|
|
11
|
+
Requires-Dist: jsonref>=1.1.0
|
|
12
|
+
Requires-Dist: google-genai>=1.47.0
|
|
13
|
+
|
|
14
|
+
# graded 🍳
|
|
15
|
+
|
|
16
|
+
`graded` is a defensive verifier and grading framework designed for agent evaluations (particularly for Harbor agent evaluations). It allows you to declare structured grading criteria, leverage LLM judges with automatic tracing, and safely manage evaluation artifacts.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
Install `graded` directly from PyPI (or your internal registry):
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install graded
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Or with `uv`:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv pip install graded
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Quick Start
|
|
37
|
+
|
|
38
|
+
Create an evaluation script (e.g. `verify.py`) to grade a task workspace:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from graded import Evaluator
|
|
43
|
+
|
|
44
|
+
# Initialize the evaluator
|
|
45
|
+
ev = Evaluator(
|
|
46
|
+
workspace="/workspace",
|
|
47
|
+
output_path="/logs/verifier/reward.json",
|
|
48
|
+
auto_save_artifacts=True
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# 1. Declare a standard criterion
|
|
52
|
+
@ev.criterion(name="has_output_file", weight=1.0)
|
|
53
|
+
def check_output(workspace: Path) -> bool:
|
|
54
|
+
return (workspace / "output.txt").is_file()
|
|
55
|
+
|
|
56
|
+
# 2. Declare a fatal criterion (short-circuits score to 0.0 if failed)
|
|
57
|
+
@ev.criterion(name="no_syntax_errors", weight=2.0, fatal=True)
|
|
58
|
+
def check_syntax(workspace: Path) -> bool:
|
|
59
|
+
# return True or False (or float 0.0 - 1.0)
|
|
60
|
+
return True
|
|
61
|
+
|
|
62
|
+
# 3. Declare a fractional scoring criterion
|
|
63
|
+
@ev.criterion(name="test_pass_rate", weight=3.0)
|
|
64
|
+
def check_tests(workspace: Path) -> float:
|
|
65
|
+
# Returns a score between 0.0 and 1.0
|
|
66
|
+
return 0.8 # e.g., 80% of tests passed
|
|
67
|
+
|
|
68
|
+
# Run the evaluation and write outputs
|
|
69
|
+
if __name__ == "__main__":
|
|
70
|
+
ev.run()
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Core Features
|
|
76
|
+
|
|
77
|
+
### 1. Criteria Declarations (`@ev.criterion`)
|
|
78
|
+
Define check functions using the `@ev.criterion` decorator.
|
|
79
|
+
- **`name`**: The unique identifier for the criterion.
|
|
80
|
+
- **`weight`**: Relative weight of the score in the final weighted average calculation.
|
|
81
|
+
- **`fatal`**: If set to `True`, any score of `0.0` or `False` immediately short-circuits the final score to `0.0`.
|
|
82
|
+
- **Return Value**: Must return a `bool`, `int`, or `float`. Anything else raises a `ValueError`.
|
|
83
|
+
|
|
84
|
+
### 2. LLM Judge with Automatic Tracing
|
|
85
|
+
`graded` integrates with `instructor` to run structured, schema-validated LLM grading prompts, automatically logging prompt, parameters, response schema, and LLM responses to `traces.json`.
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from pydantic import BaseModel, Field
|
|
89
|
+
|
|
90
|
+
class Rubric(BaseModel):
|
|
91
|
+
score: float = Field(description="Score between 0.0 and 1.0 based on correctness.")
|
|
92
|
+
reasoning: str = Field(description="Detailed reasoning for the score.")
|
|
93
|
+
|
|
94
|
+
# In your criterion:
|
|
95
|
+
result = ev.llm_judge(
|
|
96
|
+
model="google/gemini-3.5-flash",
|
|
97
|
+
response_model=Rubric,
|
|
98
|
+
system="You are a strict code correctness evaluator.",
|
|
99
|
+
prompt="Compare the student's solution in code.py with the requirements...",
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
print(f"LLM Score: {result.score}")
|
|
103
|
+
print(f"Reasoning: {result.reasoning}")
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 3. File & Artifact Management
|
|
107
|
+
Safely access files and copy evaluation artifacts to the logs directory for post-evaluation review.
|
|
108
|
+
|
|
109
|
+
- **`ev.read_file(filename)`**: Safely reads content as a string. Auto-saves a copy to artifacts.
|
|
110
|
+
- **`ev.load_json(filename)`**: Safely parses JSON file content. Auto-saves a copy to artifacts.
|
|
111
|
+
- **`ev.save_file(filename, content)`**: Save text content (a `str`, written as UTF-8) to the artifacts directory.
|
|
112
|
+
- **`ev.save_dir(dirname)`**: Copy an entire directory from the workspace to the artifacts directory.
|
|
113
|
+
- **`ev.load_trajectory(path)`**: Load and parse an agent's ATIF `trajectory.json` file into a typed `Trajectory` object.
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Outputs
|
|
118
|
+
|
|
119
|
+
When `ev.run()` completes, the following files are written to the directory containing your configured `output_path`:
|
|
120
|
+
|
|
121
|
+
1. **`reward.json`**: A flat JSON dictionary containing the final calculated `reward` and the individual scores for each criterion:
|
|
122
|
+
```json
|
|
123
|
+
{
|
|
124
|
+
"reward": 0.75,
|
|
125
|
+
"has_output_file": 1.0,
|
|
126
|
+
"no_syntax_errors": 1.0,
|
|
127
|
+
"test_pass_rate": 0.8
|
|
128
|
+
}
|
|
129
|
+
```
|
|
130
|
+
2. **`reward.txt`**: A text file containing just the final reward float value (e.g. `0.7500\n`).
|
|
131
|
+
3. **`traces.json`**: A list of structured LLM calls made via `ev.llm_judge`, detailing inputs, responses, latencies, and metadata.
|
|
132
|
+
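4. **`metadata.json`**: (Optional) Written only when `metadata` was passed to `Evaluator`; contains that evaluator-level metadata. Per-call metadata passed to `ev.llm_judge` is recorded with each entry in `traces.json`.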
|
|
133
|
+
5. **`artifacts/`**: Subfolder containing copy-back files preserved during the evaluation run.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
graded/__init__.py,sha256=Kfb66m1tMq79DHJAqNnIPZgAyE8pcVVk16VcM0D2kF8,174
|
|
2
|
+
graded/evaluator.py,sha256=Mt0tJLe_fHVgnq6c1mDCtvSiwn8yhQrisYRTl7T5nWw,12141
|
|
3
|
+
graded/types.py,sha256=sYQBQmnwaLPQK1CYiei0T4rtMfuEMBjRWgl-ETe2WxI,4478
|
|
4
|
+
graded-1.0.0.dist-info/METADATA,sha256=sSKpwxQX3tJoIgZ8TWOQT-saTLEqfdikj3Xy1IZrE7Q,4650
|
|
5
|
+
graded-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
graded-1.0.0.dist-info/top_level.txt,sha256=I4zw-dv35tCOLJzWEziFD8e8LehVwqPoCwaPzUzy8R0,7
|
|
7
|
+
graded-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
graded
|