graded 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
graded-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: graded
3
+ Version: 1.0.0
4
+ Summary: Defensive verifier framework and helpers for Harbor evaluations
5
+ Classifier: Programming Language :: Python :: 3
6
+ Classifier: Operating System :: OS Independent
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: pydantic>=2.0
10
+ Requires-Dist: instructor>=1.0.0
11
+ Requires-Dist: jsonref>=1.1.0
12
+ Requires-Dist: google-genai>=1.47.0
13
+
14
+ # graded 🍳
15
+
16
+ `graded` is a defensive verifier and grading framework designed for agent evaluations (particularly for Harbor agent evaluations). It allows you to declare structured grading criteria, leverage LLM judges with automatic tracing, and safely manage evaluation artifacts.
17
+
18
+ ---
19
+
20
+ ## Installation
21
+
22
+ Install `graded` directly from PyPI (or your internal registry):
23
+
24
+ ```bash
25
+ pip install graded
26
+ ```
27
+
28
+ Or with `uv`:
29
+
30
+ ```bash
31
+ uv pip install graded
32
+ ```
33
+
34
+ ---
35
+
36
+ ## Quick Start
37
+
38
+ Create an evaluation script (e.g. `verify.py`) to grade a task workspace:
39
+
40
+ ```python
41
+ from pathlib import Path
42
+ from graded import Evaluator
43
+
44
+ # Initialize the evaluator
45
+ ev = Evaluator(
46
+ workspace="/workspace",
47
+ output_path="/logs/verifier/reward.json",
48
+ auto_save_artifacts=True
49
+ )
50
+
51
+ # 1. Declare a standard criterion
52
+ @ev.criterion(name="has_output_file", weight=1.0)
53
+ def check_output(workspace: Path) -> bool:
54
+ return (workspace / "output.txt").is_file()
55
+
56
+ # 2. Declare a fatal criterion (short-circuits score to 0.0 if failed)
57
+ @ev.criterion(name="no_syntax_errors", weight=2.0, fatal=True)
58
+ def check_syntax(workspace: Path) -> bool:
59
+ # return True or False (or float 0.0 - 1.0)
60
+ return True
61
+
62
+ # 3. Declare a fractional scoring criterion
63
+ @ev.criterion(name="test_pass_rate", weight=3.0)
64
+ def check_tests(workspace: Path) -> float:
65
+ # Returns a score between 0.0 and 1.0
66
+ return 0.8 # e.g., 80% of tests passed
67
+
68
+ # Run the evaluation and write outputs
69
+ if __name__ == "__main__":
70
+ ev.run()
71
+ ```
72
+
73
+ ---
74
+
75
+ ## Core Features
76
+
77
+ ### 1. Criteria Declarations (`@ev.criterion`)
78
+ Define check functions using the `@ev.criterion` decorator.
79
+ - **`name`**: The unique identifier for the criterion.
80
+ - **`weight`**: Relative weight of the score in the final weighted average calculation.
81
+ - **`fatal`**: If set to `True`, any score of `0.0` or `False` immediately short-circuits the final score to `0.0`.
82
+ - **Return Value**: Must return a `bool`, `int`, or `float`. Anything else raises a `ValueError`.
83
+
84
+ ### 2. LLM Judge with Automatic Tracing
85
+ `graded` integrates with `instructor` to run structured, schema-validated LLM grading prompts, automatically logging prompt, parameters, response schema, and LLM responses to `traces.json`.
86
+
87
+ ```python
88
+ from pydantic import BaseModel, Field
89
+
90
+ class Rubric(BaseModel):
91
+ score: float = Field(description="Score between 0.0 and 1.0 based on correctness.")
92
+ reasoning: str = Field(description="Detailed reasoning for the score.")
93
+
94
+ # In your criterion:
95
+ result = ev.llm_judge(
96
+ model="google/gemini-3.5-flash",
97
+ response_model=Rubric,
98
+ system="You are a strict code correctness evaluator.",
99
+ prompt="Compare the student's solution in code.py with the requirements...",
100
+ )
101
+
102
+ print(f"LLM Score: {result.score}")
103
+ print(f"Reasoning: {result.reasoning}")
104
+ ```
105
+
106
+ ### 3. File & Artifact Management
107
+ Safely access files and copy evaluation artifacts to the logs directory for post-evaluation review.
108
+
109
+ - **`ev.read_file(filename)`**: Safely reads content as a string. Auto-saves a copy to artifacts.
110
+ - **`ev.load_json(filename)`**: Safely parses JSON file content. Auto-saves a copy to artifacts.
111
+ - **`ev.save_file(filename, content)`**: Save arbitrary text/data to the artifacts directory.
112
+ - **`ev.save_dir(dirname)`**: Copy an entire directory from the workspace to the artifacts directory.
113
+ - **`ev.load_trajectory(path)`**: Load and parse an agent's ATIF `trajectory.json` file into a typed `Trajectory` object.
114
+
115
+ ---
116
+
117
+ ## Outputs
118
+
119
+ When `ev.run()` completes, the following files are written to the directory containing your configured `output_path`:
120
+
121
+ 1. **`reward.json`**: A flat JSON dictionary containing the final calculated `reward` and the individual scores for each criterion:
122
+ ```json
123
+ {
124
+ "reward": 0.9,
125
+ "has_output_file": 1.0,
126
+ "no_syntax_errors": 1.0,
127
+ "test_pass_rate": 0.8
128
+ }
129
+ ```
130
+ 2. **`reward.txt`**: A text file containing just the final reward float value (e.g. `0.9000\n`).
131
+ 3. **`traces.json`**: A list of structured LLM calls made via `ev.llm_judge`, recording the model, prompts, extra call parameters, response schema, the response (or error), status, and metadata.
132
+ 4. **`metadata.json`**: (Optional) Written only when evaluator-level `metadata` is non-empty; contains that metadata. Per-call `ev.llm_judge` metadata is recorded in `traces.json`.
133
+ 5. **`artifacts/`**: Subfolder containing copy-back files preserved during the evaluation run.
graded-1.0.0/README.md ADDED
@@ -0,0 +1,120 @@
1
+ # graded 🍳
2
+
3
+ `graded` is a defensive verifier and grading framework designed for agent evaluations (particularly for Harbor agent evaluations). It allows you to declare structured grading criteria, leverage LLM judges with automatic tracing, and safely manage evaluation artifacts.
4
+
5
+ ---
6
+
7
+ ## Installation
8
+
9
+ Install `graded` directly from PyPI (or your internal registry):
10
+
11
+ ```bash
12
+ pip install graded
13
+ ```
14
+
15
+ Or with `uv`:
16
+
17
+ ```bash
18
+ uv pip install graded
19
+ ```
20
+
21
+ ---
22
+
23
+ ## Quick Start
24
+
25
+ Create an evaluation script (e.g. `verify.py`) to grade a task workspace:
26
+
27
+ ```python
28
+ from pathlib import Path
29
+ from graded import Evaluator
30
+
31
+ # Initialize the evaluator
32
+ ev = Evaluator(
33
+ workspace="/workspace",
34
+ output_path="/logs/verifier/reward.json",
35
+ auto_save_artifacts=True
36
+ )
37
+
38
+ # 1. Declare a standard criterion
39
+ @ev.criterion(name="has_output_file", weight=1.0)
40
+ def check_output(workspace: Path) -> bool:
41
+ return (workspace / "output.txt").is_file()
42
+
43
+ # 2. Declare a fatal criterion (short-circuits score to 0.0 if failed)
44
+ @ev.criterion(name="no_syntax_errors", weight=2.0, fatal=True)
45
+ def check_syntax(workspace: Path) -> bool:
46
+ # return True or False (or float 0.0 - 1.0)
47
+ return True
48
+
49
+ # 3. Declare a fractional scoring criterion
50
+ @ev.criterion(name="test_pass_rate", weight=3.0)
51
+ def check_tests(workspace: Path) -> float:
52
+ # Returns a score between 0.0 and 1.0
53
+ return 0.8 # e.g., 80% of tests passed
54
+
55
+ # Run the evaluation and write outputs
56
+ if __name__ == "__main__":
57
+ ev.run()
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Core Features
63
+
64
+ ### 1. Criteria Declarations (`@ev.criterion`)
65
+ Define check functions using the `@ev.criterion` decorator.
66
+ - **`name`**: The unique identifier for the criterion.
67
+ - **`weight`**: Relative weight of the score in the final weighted average calculation.
68
+ - **`fatal`**: If set to `True`, any score of `0.0` or `False` immediately short-circuits the final score to `0.0`.
69
+ - **Return Value**: Must return a `bool`, `int`, or `float`. Anything else raises a `ValueError`.
70
+
71
+ ### 2. LLM Judge with Automatic Tracing
72
+ `graded` integrates with `instructor` to run structured, schema-validated LLM grading prompts, automatically logging prompt, parameters, response schema, and LLM responses to `traces.json`.
73
+
74
+ ```python
75
+ from pydantic import BaseModel, Field
76
+
77
+ class Rubric(BaseModel):
78
+ score: float = Field(description="Score between 0.0 and 1.0 based on correctness.")
79
+ reasoning: str = Field(description="Detailed reasoning for the score.")
80
+
81
+ # In your criterion:
82
+ result = ev.llm_judge(
83
+ model="google/gemini-3.5-flash",
84
+ response_model=Rubric,
85
+ system="You are a strict code correctness evaluator.",
86
+ prompt="Compare the student's solution in code.py with the requirements...",
87
+ )
88
+
89
+ print(f"LLM Score: {result.score}")
90
+ print(f"Reasoning: {result.reasoning}")
91
+ ```
92
+
93
+ ### 3. File & Artifact Management
94
+ Safely access files and copy evaluation artifacts to the logs directory for post-evaluation review.
95
+
96
+ - **`ev.read_file(filename)`**: Safely reads content as a string. Auto-saves a copy to artifacts.
97
+ - **`ev.load_json(filename)`**: Safely parses JSON file content. Auto-saves a copy to artifacts.
98
+ - **`ev.save_file(filename, content)`**: Save arbitrary text/data to the artifacts directory.
99
+ - **`ev.save_dir(dirname)`**: Copy an entire directory from the workspace to the artifacts directory.
100
+ - **`ev.load_trajectory(path)`**: Load and parse an agent's ATIF `trajectory.json` file into a typed `Trajectory` object.
101
+
102
+ ---
103
+
104
+ ## Outputs
105
+
106
+ When `ev.run()` completes, the following files are written to the directory containing your configured `output_path`:
107
+
108
+ 1. **`reward.json`**: A flat JSON dictionary containing the final calculated `reward` and the individual scores for each criterion:
109
+ ```json
110
+ {
111
+ "reward": 0.9,
111
+ "has_output_file": 1.0,
112
+ "no_syntax_errors": 1.0,
113
+ "test_pass_rate": 0.8
114
+ }
115
+ ```
116
+ 2. **`reward.txt`**: A text file containing just the final reward float value (e.g. `0.9000\n`).
118
+ 3. **`traces.json`**: A list of structured LLM calls made via `ev.llm_judge`, recording the model, prompts, extra call parameters, response schema, the response (or error), status, and metadata.
119
+ 4. **`metadata.json`**: (Optional) Written only when evaluator-level `metadata` is non-empty; contains that metadata. Per-call `ev.llm_judge` metadata is recorded in `traces.json`.
120
+ 5. **`artifacts/`**: Subfolder containing copy-back files preserved during the evaluation run.
@@ -0,0 +1,23 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "graded"
7
+ version = "1.0.0"
8
+ description = "Defensive verifier framework and helpers for Harbor evaluations"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "Operating System :: OS Independent",
14
+ ]
15
+ dependencies = [
16
+ "pydantic>=2.0",
17
+ "instructor>=1.0.0",
18
+ "jsonref>=1.1.0",
19
+ "google-genai>=1.47.0",
20
+ ]
21
+
22
+ [tool.setuptools.packages.find]
23
+ where = ["src"]
graded-1.0.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,4 @@
1
+ from graded.evaluator import Evaluator
2
+ from graded.types import Criterion, Trajectory, ToolCall, Step
3
+
4
+ __all__ = ["Evaluator", "Criterion", "Trajectory", "ToolCall", "Step"]
@@ -0,0 +1,329 @@
1
+ import os
2
+ import json
3
+ import shutil
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Callable, Any, Dict, List, Optional, Type, Union
7
+ from pydantic import BaseModel
8
+
9
+ from graded.types import Criterion, Trajectory
10
+
11
+ logging.basicConfig(
12
+ level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
13
+ )
14
+
15
+
16
class Evaluator:
    """Register grading criteria, run them, and write the reward outputs.

    Criteria are declared with :meth:`criterion` and executed in declaration
    order by :meth:`run`, which writes ``reward.json``, ``reward.txt``,
    ``traces.json`` and (when metadata is set) ``metadata.json`` into the
    directory of ``output_path``. The remaining helpers read files from the
    workspace and copy artifacts into ``artifacts_dir``.
    """

    def __init__(
        self,
        workspace: Union[str, Path] = "/workspace",
        output_path: Union[str, Path] = "/logs/verifier/reward.json",
        auto_save_artifacts: bool = True,
        artifacts_dir: Optional[Union[str, Path]] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Configure paths and initialise empty run state.

        Args:
            workspace: Root directory that criteria inspect.
            output_path: Location of ``reward.json``; the other output files
                are written into the same directory.
            auto_save_artifacts: Default for the ``save_artifact`` argument of
                :meth:`read_file` and :meth:`load_json`.
            artifacts_dir: Where artifacts are copied. Defaults to an
                ``artifacts`` folder next to ``output_path``.
            metadata: Evaluator-level metadata. Merged into every LLM trace
                and written to ``metadata.json`` when non-empty.
        """
        self.workspace = Path(workspace)
        self.output_path = Path(output_path)
        self.auto_save_artifacts = auto_save_artifacts
        self.artifacts_dir = (
            Path(artifacts_dir)
            if artifacts_dir
            else self.output_path.parent / "artifacts"
        )
        self.metadata: Dict[str, Any] = metadata or {}
        self.criteria: List["Criterion"] = []
        self.scores: Dict[str, float] = {}
        self.traces: List[Dict[str, Any]] = []

    def criterion(self, name: str, weight: float = 1.0, fatal: bool = False):
        """Decorator to declare a grading criterion.

        Args:
            name: Name of the criterion. Must be unique within this evaluator.
            weight: Relative weight for scoring.
            fatal: If True, a score of 0.0 short-circuits the entire evaluation to 0.0.

        Raises:
            ValueError: When applied, if ``name`` is already registered.
        """

        def decorator(func: Callable[[Path], Any]):
            if any(c.name == name for c in self.criteria):
                raise ValueError(f"Duplicate criterion name: '{name}'")
            self.criteria.append(
                Criterion(name=name, weight=weight, fatal=fatal, func=func)
            )
            # Return the function unchanged so it stays directly callable.
            return func

        return decorator

    def _artifact_path(self, name: str) -> Path:
        """Map ``name`` to a location that always lies inside ``artifacts_dir``.

        Joining an absolute path onto ``artifacts_dir`` would discard
        ``artifacts_dir`` entirely (so the "copy" would target the source
        itself), and ``..`` components could climb out of it. Both are
        stripped; ordinary relative names are returned unchanged.
        """
        requested = Path(name)
        safe_parts = [
            part
            for part in requested.parts
            if part not in (requested.anchor, "..")
        ]
        return self.artifacts_dir.joinpath(*safe_parts)

    def _save_artifact(self, filename: str, content: str) -> None:
        """Write ``content`` under the artifacts directory (best effort).

        Failures are logged rather than raised so that artifact bookkeeping
        can never abort a grading run.
        """
        try:
            dest = self._artifact_path(filename)
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_text(content, encoding="utf-8")
        except Exception as e:
            logging.error(f"Failed to save artifact {filename}: {e}")

    def save_file(self, filename: str, content: str) -> None:
        """Explicitly save text content to the artifacts directory."""
        self._save_artifact(filename, content)

    def save_dir(self, dirname: str) -> None:
        """Copy an entire directory from the workspace to the artifacts directory.

        A previous copy of the same directory is replaced. A missing source
        directory logs a warning; copy failures are logged, not raised.
        """
        src = self.workspace / dirname
        if not src.is_dir():
            logging.warning(f"Directory {dirname} not found in workspace.")
            return
        dest = self._artifact_path(dirname)
        try:
            if dest == self.artifacts_dir:
                # The request maps onto the artifacts root itself (e.g. "" or
                # "."): merge into it instead of wiping every artifact that
                # was saved earlier.
                shutil.copytree(src, dest, dirs_exist_ok=True)
            else:
                if dest.exists():
                    shutil.rmtree(dest)
                shutil.copytree(src, dest)
        except Exception as e:
            logging.error(f"Failed to save directory artifact {dirname}: {e}")

    def _read_and_archive(self, filename: str, save_artifact: Optional[bool]) -> str:
        """Read a workspace file as UTF-8 and optionally copy it to the artifacts.

        ``save_artifact=None`` defers to the instance-level
        ``auto_save_artifacts`` setting. Read errors propagate to the caller.
        """
        text = (self.workspace / filename).read_text(encoding="utf-8")
        should_save = (
            self.auto_save_artifacts if save_artifact is None else save_artifact
        )
        if should_save:
            self._save_artifact(filename, text)
        return text

    def load_json(
        self, filename: str, save_artifact: Optional[bool] = None
    ) -> Optional[Any]:
        """Safely loads and parses JSON from the workspace.

        Args:
            filename: Path relative to the workspace.
            save_artifact: Whether to save a copy to the artifacts directory.
                Defaults to the instance-level auto_save_artifacts setting.

        Returns:
            The parsed JSON value, or ``None`` when the file is missing,
            unreadable, or not valid JSON (the problem is logged).
        """
        if not (self.workspace / filename).exists():
            logging.warning(f"File {filename} not found in workspace.")
            return None
        try:
            # The raw text is archived before parsing so that malformed JSON
            # is still available for post-evaluation review.
            return json.loads(self._read_and_archive(filename, save_artifact))
        except Exception as e:
            logging.error(f"Error parsing JSON file {filename}: {e}")
            return None

    def read_file(
        self, filename: str, save_artifact: Optional[bool] = None
    ) -> Optional[str]:
        """Safely reads file content from the workspace.

        Args:
            filename: Path relative to the workspace.
            save_artifact: Whether to save a copy to the artifacts directory.
                Defaults to the instance-level auto_save_artifacts setting.

        Returns:
            The file content, or ``None`` when the file is missing or cannot
            be read as UTF-8 text (the problem is logged).
        """
        if not (self.workspace / filename).exists():
            logging.warning(f"File {filename} not found in workspace.")
            return None
        try:
            return self._read_and_archive(filename, save_artifact)
        except Exception as e:
            logging.error(f"Error reading file {filename}: {e}")
            return None

    def load_trajectory(
        self, path: str = "/logs/agent/trajectory.json"
    ) -> "Optional[Trajectory]":
        """Load and parse the ATIF trajectory written by the agent.

        Args:
            path: Absolute path to the trajectory JSON file.
                Defaults to ``/logs/agent/trajectory.json``.

        Returns:
            A typed :class:`Trajectory` on success, or ``None`` if the file is
            missing or unparseable (the problem is logged in either case).
        """
        traj_path = Path(path)
        if not traj_path.exists():
            logging.warning(f"Trajectory file not found: {path}")
            return None
        try:
            return Trajectory.model_validate_json(traj_path.read_text(encoding="utf-8"))
        except Exception as e:
            logging.error(f"Failed to parse trajectory at {path}: {e}")
            return None

    def file_exists(self, filename: str) -> bool:
        """Checks if a file exists in the workspace."""
        return (self.workspace / filename).is_file()

    def dir_exists(self, dirname: str) -> bool:
        """Checks if a directory exists in the workspace."""
        return (self.workspace / dirname).is_dir()

    def llm_judge(
        self,
        response_model: "Type[BaseModel]",
        system: str,
        prompt: str,
        model: str,
        client: Optional[Any] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Any:
        """Call instructor LLM judge with structured responses and trace the call.

        Every call appends one entry to ``self.traces`` with status
        ``"success"`` or ``"failed"``.

        Args:
            response_model: Pydantic model for structured output.
            system: System prompt text.
            prompt: User prompt text.
            model: Model identifier (e.g. "google/gemini-3.1-flash-lite").
            client: Optional pre-configured instructor client.
            metadata: Optional per-call metadata dict. Merged with evaluator-level
                metadata for experiment tracking; per-call keys win on conflict.
            **kwargs: Additional arguments passed to client.create().

        Returns:
            The ``response_model`` instance returned by the client.

        Raises:
            Exception: Whatever the client raised, re-raised unchanged after
                the failed trace has been recorded.
        """
        import instructor

        trace_data = {
            "model": model,
            "system": system,
            "prompt": prompt,
            # repr() keeps the trace JSON-serialisable whatever was passed.
            "kwargs": {k: repr(v) for k, v in kwargs.items()},
            "response_model_schema": response_model.model_json_schema(),
            "metadata": {**self.metadata, **(metadata or {})},
        }

        try:
            if client is None:
                client = instructor.from_provider(model=model)

            result = client.create(
                # The client receives the part after the last "/" of the
                # identifier (e.g. "gemini-..." for "google/gemini-...").
                model=model.split("/")[-1],
                response_model=response_model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": prompt},
                ],
                **kwargs,
            )
            self.traces.append(
                {**trace_data, "response": result.model_dump(), "status": "success"}
            )
            return result
        except Exception as e:
            self.traces.append({**trace_data, "error": str(e), "status": "failed"})
            raise

    def _score_criterion(self, crit: "Criterion") -> float:
        """Run a single criterion and coerce its result to a float score.

        A crash inside the criterion is caught and scored 0.0, and so is a
        non-finite result (NaN or infinity), which would otherwise poison the
        weighted average and produce invalid JSON. A return value that is not
        ``bool | int | float`` raises ``ValueError`` (a likely forgotten
        ``return``).
        """
        import math

        failed_line = (
            f"CRITERION: {crit.name} (weight={crit.weight}) -> FAILED (Score: 0.0)"
        )
        try:
            res = crit.func(self.workspace)
        except Exception as e:
            logging.error(
                f"Failed executing criterion '{crit.name}': {e}", exc_info=True
            )
            print(failed_line)
            return 0.0

        if not isinstance(res, (bool, int, float)):
            raise ValueError(
                f"Criterion '{crit.name}' must return bool | int | float, "
                f"got {type(res).__name__}. Did you forget a return?"
            )

        score = float(res)  # float(True) == 1.0, float(False) == 0.0
        if not math.isfinite(score):
            logging.error(
                f"Criterion '{crit.name}' returned a non-finite score ({res!r}); "
                "scoring 0.0"
            )
            print(failed_line)
            return 0.0

        print(f"CRITERION: {crit.name} (weight={crit.weight}) -> Score: {score}")
        return score

    def run(self):
        """Executes all criteria, aggregates weighted scores, and writes outputs.

        Criteria run in declaration order. The reward is the weighted average
        of their scores, or 0.0 when the total weight is not positive. A fatal
        criterion scoring 0.0 stops the run immediately with a reward of 0.0.
        """
        total_weight = 0.0
        weighted_score = 0.0

        print("=== Start Evaluation ===")
        for crit in self.criteria:
            total_weight += crit.weight
            score = self._score_criterion(crit)
            self.scores[crit.name] = score
            weighted_score += score * crit.weight

            if crit.fatal and score == 0.0:
                print(
                    f"FATAL: Criterion '{crit.name}' failed. Short-circuiting to 0.0."
                )
                self._write_outputs(0.0)
                return

        final_reward = (weighted_score / total_weight) if total_weight > 0 else 0.0
        self._write_outputs(final_reward)

    def _write_outputs(self, final_reward: float):
        """Write all output files (reward, metadata, traces) on a best-effort basis.

        Each write is attempted independently; a failure is logged and does
        not prevent the remaining files from being written.
        """
        print(f"Final Computed Reward: {final_reward:.4f}")

        out_dir = self.output_path.parent
        try:
            out_dir.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            # Keep going: the individual writes below log their own failures.
            logging.error(f"Failed to create output directory {out_dir}: {e}")

        # Write legacy reward.txt format
        try:
            self.output_path.with_name("reward.txt").write_text(
                f"{final_reward:.4f}\n", encoding="utf-8"
            )
        except Exception as e:
            logging.error(f"Failed to write reward.txt: {e}")

        # Write structured reward.json — flat dict[str, float|int] for Harbor compatibility.
        # Each criterion score is a top-level key alongside 'reward'. The
        # 'reward' key always holds the final reward, even if a criterion
        # happens to share that name.
        output_data = {"reward": final_reward}
        for crit_name, crit_score in self.scores.items():
            if crit_name == "reward":
                logging.warning(
                    "Criterion named 'reward' collides with the final reward key; "
                    "its score is omitted from reward.json."
                )
                continue
            output_data[crit_name] = crit_score
        try:
            self.output_path.write_text(
                json.dumps(output_data, indent=2), encoding="utf-8"
            )
        except Exception as e:
            logging.error(f"Failed to write reward.json: {e}")

        # Write metadata.json if metadata was provided
        if self.metadata:
            try:
                (out_dir / "metadata.json").write_text(
                    json.dumps(self.metadata, indent=2), encoding="utf-8"
                )
            except Exception as e:
                logging.error(f"Failed to write metadata.json: {e}")

        # Write LLM judge traces
        try:
            (out_dir / "traces.json").write_text(
                json.dumps(self.traces, indent=2), encoding="utf-8"
            )
        except Exception as e:
            logging.error(f"Failed to write traces.json: {e}")

        print("=== Evaluation Finished ===")
@@ -0,0 +1,145 @@
1
+ """Shared types for graded verifiers.
2
+
3
+ Contains the grading :class:`Criterion` and the lightweight ATIF trajectory
4
+ types (:class:`Trajectory`, :class:`Step`, :class:`ToolCall`). The trajectory
5
+ types are intentionally minimal read-only mirrors of the ATIF types defined in
6
+ ``harbor.models.trajectories``. They use ``extra="ignore"`` throughout so that
7
+ forward-compatible ATIF schema additions (new fields, new versions) never cause
8
+ parse failures in verifier scripts.
9
+
10
+ Use ``Evaluator.load_trajectory()`` to get a typed ``Trajectory`` from the
11
+ agent log written during a trial.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from pathlib import Path
17
+ from typing import Any, Callable, Optional
18
+
19
+ from pydantic import BaseModel, Field
20
+
21
+
22
class Criterion(BaseModel):
    """One grading check, as registered through ``Evaluator.criterion``."""

    # ``func`` holds an arbitrary callable, so arbitrary types are permitted.
    model_config = {"arbitrary_types_allowed": True}

    # Identifier of the criterion.
    name: str
    # Relative weight of this criterion's score.
    weight: float = 1.0
    # Whether this criterion is flagged as fatal.
    fatal: bool = False
    # The check itself; called with a path and expected to return a score.
    func: Callable[[Path], Any]
31
+
32
+
33
class ToolCall(BaseModel):
    """One tool invocation recorded inside an agent step."""

    # Unknown keys in the input are dropped instead of failing validation.
    model_config = {"extra": "ignore"}

    tool_call_id: str
    function_name: str
    # Arguments of the call; an empty dict when none are given.
    arguments: dict[str, Any] = Field(default_factory=dict)

    def arg(self, key: str, default: Any = None) -> Any:
        """Return the argument stored under ``key``, or ``default`` if absent."""
        try:
            return self.arguments[key]
        except KeyError:
            return default
45
+
46
+
47
class Step(BaseModel):
    """One turn in the agent trajectory."""

    step_id: int
    source: str  # "user" | "agent" | "system"
    message: Any = ""  # str or list[ContentPart] — we accept either
    # Spelled with ``Optional`` rather than ``X | None``: pydantic evaluates
    # field annotations at class-creation time, and the ``|`` union syntax
    # cannot be evaluated on Python 3.9, which this package still supports.
    tool_calls: Optional[list[ToolCall]] = None

    # Unknown keys in the input are dropped instead of failing validation.
    model_config = {"extra": "ignore"}
56
+
57
+
58
class Trajectory(BaseModel):
    """Parsed ATIF trajectory. Exposes a small query API for verifier use."""

    schema_version: str = ""
    # Spelled with ``Optional`` rather than ``str | None``: pydantic evaluates
    # field annotations at class-creation time, and the ``|`` union syntax
    # cannot be evaluated on Python 3.9, which this package still supports.
    session_id: Optional[str] = None
    steps: list[Step] = Field(default_factory=list)

    # Unknown keys in the input are dropped instead of failing validation.
    model_config = {"extra": "ignore"}

    # ------------------------------------------------------------------
    # Query primitives
    # ------------------------------------------------------------------

    def all_tool_calls(self) -> list[ToolCall]:
        """Flat list of every tool call across all steps, in step order."""
        return [
            tc
            for step in self.steps
            if step.tool_calls
            for tc in step.tool_calls
        ]

    def tool_calls_for(self, function_name: str) -> list[ToolCall]:
        """All tool calls whose ``function_name`` matches exactly.

        Equivalent to ``find_all(function_name)``.
        """
        return self.find_all(function_name)

    def exists(
        self,
        function_name: str,
        predicate: Optional[Callable[[ToolCall], bool]] = None,
    ) -> bool:
        """Return ``True`` if any tool call matches ``function_name`` and,
        optionally, satisfies ``predicate``.

        Examples::

            trajectory.exists("write_file")
            trajectory.exists(
                "write_file",
                lambda tc: PurePosixPath(tc.arg("path", "")).name == "blog_post.md",
            )
            trajectory.exists("terminal", lambda tc: "pytest" in tc.arg("command", ""))
        """
        return self.find(function_name, predicate) is not None

    def find(
        self,
        function_name: str,
        predicate: Optional[Callable[[ToolCall], bool]] = None,
    ) -> Optional[ToolCall]:
        """Return the first :class:`ToolCall` matching ``function_name`` (and
        ``predicate`` if given), or ``None`` if no match is found.

        The search stops at the first match, so ``predicate`` is not called
        for later tool calls.

        Example::

            tc = trajectory.find("write_file")
            if tc:
                print(tc.arg("path"))
        """
        for tc in self.all_tool_calls():
            if tc.function_name != function_name:
                continue
            if predicate is None or predicate(tc):
                return tc
        return None

    def find_all(
        self,
        function_name: str,
        predicate: Optional[Callable[[ToolCall], bool]] = None,
    ) -> list[ToolCall]:
        """Return all :class:`ToolCall` objects matching ``function_name`` (and
        ``predicate`` if given), in trajectory order.

        Example::

            calls = trajectory.find_all("write_file")
            paths = [tc.arg("path") for tc in calls]
        """
        return [
            tc
            for tc in self.all_tool_calls()
            if tc.function_name == function_name
            and (predicate is None or predicate(tc))
        ]
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: graded
3
+ Version: 1.0.0
4
+ Summary: Defensive verifier framework and helpers for Harbor evaluations
5
+ Classifier: Programming Language :: Python :: 3
6
+ Classifier: Operating System :: OS Independent
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: pydantic>=2.0
10
+ Requires-Dist: instructor>=1.0.0
11
+ Requires-Dist: jsonref>=1.1.0
12
+ Requires-Dist: google-genai>=1.47.0
13
+
14
+ # graded 🍳
15
+
16
+ `graded` is a defensive verifier and grading framework designed for agent evaluations (particularly for Harbor agent evaluations). It allows you to declare structured grading criteria, leverage LLM judges with automatic tracing, and safely manage evaluation artifacts.
17
+
18
+ ---
19
+
20
+ ## Installation
21
+
22
+ Install `graded` directly from PyPI (or your internal registry):
23
+
24
+ ```bash
25
+ pip install graded
26
+ ```
27
+
28
+ Or with `uv`:
29
+
30
+ ```bash
31
+ uv pip install graded
32
+ ```
33
+
34
+ ---
35
+
36
+ ## Quick Start
37
+
38
+ Create an evaluation script (e.g. `verify.py`) to grade a task workspace:
39
+
40
+ ```python
41
+ from pathlib import Path
42
+ from graded import Evaluator
43
+
44
+ # Initialize the evaluator
45
+ ev = Evaluator(
46
+ workspace="/workspace",
47
+ output_path="/logs/verifier/reward.json",
48
+ auto_save_artifacts=True
49
+ )
50
+
51
+ # 1. Declare a standard criterion
52
+ @ev.criterion(name="has_output_file", weight=1.0)
53
+ def check_output(workspace: Path) -> bool:
54
+ return (workspace / "output.txt").is_file()
55
+
56
+ # 2. Declare a fatal criterion (short-circuits score to 0.0 if failed)
57
+ @ev.criterion(name="no_syntax_errors", weight=2.0, fatal=True)
58
+ def check_syntax(workspace: Path) -> bool:
59
+ # return True or False (or float 0.0 - 1.0)
60
+ return True
61
+
62
+ # 3. Declare a fractional scoring criterion
63
+ @ev.criterion(name="test_pass_rate", weight=3.0)
64
+ def check_tests(workspace: Path) -> float:
65
+ # Returns a score between 0.0 and 1.0
66
+ return 0.8 # e.g., 80% of tests passed
67
+
68
+ # Run the evaluation and write outputs
69
+ if __name__ == "__main__":
70
+ ev.run()
71
+ ```
72
+
73
+ ---
74
+
75
+ ## Core Features
76
+
77
+ ### 1. Criteria Declarations (`@ev.criterion`)
78
+ Define check functions using the `@ev.criterion` decorator.
79
+ - **`name`**: The unique identifier for the criterion.
80
+ - **`weight`**: Relative weight of the score in the final weighted average calculation.
81
+ - **`fatal`**: If set to `True`, any score of `0.0` or `False` immediately short-circuits the final score to `0.0`.
82
+ - **Return Value**: Must return a `bool`, `int`, or `float`. Anything else raises a `ValueError`.
83
+
84
+ ### 2. LLM Judge with Automatic Tracing
85
+ `graded` integrates with `instructor` to run structured, schema-validated LLM grading prompts, automatically logging prompt, parameters, response schema, and LLM responses to `traces.json`.
86
+
87
+ ```python
88
+ from pydantic import BaseModel, Field
89
+
90
+ class Rubric(BaseModel):
91
+ score: float = Field(description="Score between 0.0 and 1.0 based on correctness.")
92
+ reasoning: str = Field(description="Detailed reasoning for the score.")
93
+
94
+ # In your criterion:
95
+ result = ev.llm_judge(
96
+ model="google/gemini-3.5-flash",
97
+ response_model=Rubric,
98
+ system="You are a strict code correctness evaluator.",
99
+ prompt="Compare the student's solution in code.py with the requirements...",
100
+ )
101
+
102
+ print(f"LLM Score: {result.score}")
103
+ print(f"Reasoning: {result.reasoning}")
104
+ ```
105
+
106
+ ### 3. File & Artifact Management
107
+ Safely access files and copy evaluation artifacts to the logs directory for post-evaluation review.
108
+
109
+ - **`ev.read_file(filename)`**: Safely reads content as a string. Auto-saves a copy to artifacts.
110
+ - **`ev.load_json(filename)`**: Safely parses JSON file content. Auto-saves a copy to artifacts.
111
+ - **`ev.save_file(filename, content)`**: Save arbitrary text/data to the artifacts directory.
112
+ - **`ev.save_dir(dirname)`**: Copy an entire directory from the workspace to the artifacts directory.
113
+ - **`ev.load_trajectory(path)`**: Load and parse an agent's ATIF `trajectory.json` file into a typed `Trajectory` object.
114
+
115
+ ---
116
+
117
+ ## Outputs
118
+
119
+ When `ev.run()` completes, the following files are written to the directory containing your configured `output_path`:
120
+
121
+ 1. **`reward.json`**: A flat JSON dictionary containing the final calculated `reward` and the individual scores for each criterion:
122
+ ```json
123
+ {
124
+ "reward": 0.9,
125
+ "has_output_file": 1.0,
126
+ "no_syntax_errors": 1.0,
127
+ "test_pass_rate": 0.8
128
+ }
129
+ ```
130
+ 2. **`reward.txt`**: A text file containing just the final reward float value (e.g. `0.9000\n`).
131
+ 3. **`traces.json`**: A list of structured LLM calls made via `ev.llm_judge`, recording the model, prompts, extra call parameters, response schema, the response (or error), status, and metadata.
132
+ 4. **`metadata.json`**: (Optional) Written only when evaluator-level `metadata` is non-empty; contains that metadata. Per-call `ev.llm_judge` metadata is recorded in `traces.json`.
133
+ 5. **`artifacts/`**: Subfolder containing copy-back files preserved during the evaluation run.
@@ -0,0 +1,14 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/graded/__init__.py
4
+ src/graded/evaluator.py
5
+ src/graded/types.py
6
+ src/graded.egg-info/PKG-INFO
7
+ src/graded.egg-info/SOURCES.txt
8
+ src/graded.egg-info/dependency_links.txt
9
+ src/graded.egg-info/requires.txt
10
+ src/graded.egg-info/top_level.txt
11
+ tests/test_evaluator_artifacts.py
12
+ tests/test_evaluator_io.py
13
+ tests/test_evaluator_llm.py
14
+ tests/test_evaluator_scoring.py
@@ -0,0 +1,4 @@
1
+ pydantic>=2.0
2
+ instructor>=1.0.0
3
+ jsonref>=1.1.0
4
+ google-genai>=1.47.0
@@ -0,0 +1 @@
1
+ graded
@@ -0,0 +1,131 @@
1
+ import json
2
+ import os
3
+ from graded import Evaluator
4
+
5
+
6
def test_save_file(workspace_setup):
    """save_file writes the given text into the artifacts directory."""
    artifacts_root = workspace_setup["output_path"].parent / "artifacts"
    evaluator = workspace_setup["evaluator"]

    evaluator.save_file("captured.md", "# Hello World")

    saved = artifacts_root / "captured.md"
    assert saved.exists()
    assert saved.read_text() == "# Hello World"
15
+
16
+
17
def test_save_file_nested(workspace_setup):
    """save_file accepts a nested relative path and writes it under artifacts."""
    artifacts_root = workspace_setup["output_path"].parent / "artifacts"
    evaluator = workspace_setup["evaluator"]

    evaluator.save_file("sub/dir/deep.txt", "nested content")

    saved = artifacts_root.joinpath("sub", "dir", "deep.txt")
    assert saved.exists()
    assert saved.read_text() == "nested content"
26
+
27
+
28
def test_save_dir(workspace_setup):
    """save_dir copies a workspace directory and its files into artifacts."""
    workspace = workspace_setup["workspace"]
    evaluator = workspace_setup["evaluator"]
    artifacts_root = workspace_setup["output_path"].parent / "artifacts"

    # Populate a small directory inside the workspace.
    expected_files = {"a.txt": "aaa", "b.txt": "bbb"}
    source_dir = workspace / "mydir"
    source_dir.mkdir()
    for filename, text in expected_files.items():
        (source_dir / filename).write_text(text)

    evaluator.save_dir("mydir")

    copied_dir = artifacts_root / "mydir"
    assert copied_dir.is_dir()
    for filename, text in expected_files.items():
        assert (copied_dir / filename).read_text() == text
44
+
45
+
46
def test_save_dir_missing(workspace_setup):
    """save_dir on a nonexistent directory does not raise and copies nothing."""
    artifacts_root = workspace_setup["output_path"].parent / "artifacts"
    evaluator = workspace_setup["evaluator"]

    # A missing source directory must be tolerated (expected to only log a
    # warning), so this call is deliberately not wrapped in pytest.raises.
    evaluator.save_dir("nonexistent")

    assert not (artifacts_root / "nonexistent").exists()
55
+
56
+
57
def test_auto_capture_read_file(tmp_path):
    """With auto_save_artifacts=True, read_file also copies the file to artifacts."""
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    reward_path = tmp_path / "logs" / "reward.json"
    evaluator = Evaluator(
        workspace=workspace,
        output_path=reward_path,
        auto_save_artifacts=True,
    )

    expected = "auto captured content"
    (workspace / "doc.md").write_text(expected)

    assert evaluator.read_file("doc.md") == expected

    captured = reward_path.parent / "artifacts" / "doc.md"
    assert captured.exists()
    assert captured.read_text() == expected
71
+
72
+
73
def test_auto_capture_load_json(tmp_path):
    """With auto_save_artifacts=True, load_json also copies the file to artifacts."""
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    reward_path = tmp_path / "logs" / "reward.json"
    evaluator = Evaluator(
        workspace=workspace,
        output_path=reward_path,
        auto_save_artifacts=True,
    )

    payload = {"key": "value"}
    (workspace / "data.json").write_text(json.dumps(payload))

    assert evaluator.load_json("data.json") == payload

    # The captured copy must still parse to the same document.
    captured = reward_path.parent / "artifacts" / "data.json"
    assert captured.exists()
    assert json.loads(captured.read_text()) == payload
87
+
88
+
89
def test_auto_capture_disabled_globally(tmp_path):
    """With auto_save_artifacts=False, read_file leaves no copy in artifacts."""
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    reward_path = tmp_path / "logs" / "reward.json"
    evaluator = Evaluator(
        workspace=workspace,
        output_path=reward_path,
        auto_save_artifacts=False,
    )

    (workspace / "doc.md").write_text("should not be captured")
    evaluator.read_file("doc.md")

    assert not (reward_path.parent / "artifacts" / "doc.md").exists()
101
+
102
+
103
def test_per_call_override_save(tmp_path):
    """save_artifact=True on a call captures the file even when auto-save is off."""
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    reward_path = tmp_path / "logs" / "reward.json"

    # Globally disabled; the per-call flag below is what should trigger the copy.
    evaluator = Evaluator(
        workspace=workspace,
        output_path=reward_path,
        auto_save_artifacts=False,
    )

    (workspace / "important.md").write_text("save me")
    evaluator.read_file("important.md", save_artifact=True)

    captured = reward_path.parent / "artifacts" / "important.md"
    assert captured.exists()
    assert captured.read_text() == "save me"
117
+
118
+
119
def test_per_call_override_skip(tmp_path):
    """save_artifact=False on a call suppresses capture even when auto-save is on."""
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    reward_path = tmp_path / "logs" / "reward.json"

    # Globally enabled; the per-call flag below is what should prevent the copy.
    evaluator = Evaluator(
        workspace=workspace,
        output_path=reward_path,
        auto_save_artifacts=True,
    )

    (workspace / "config.yaml").write_text("skip me")
    evaluator.read_file("config.yaml", save_artifact=False)

    assert not (reward_path.parent / "artifacts" / "config.yaml").exists()
@@ -0,0 +1,58 @@
1
+ import json
2
+
3
def test_load_json_success(workspace_setup):
    """load_json returns the parsed content of a valid JSON file."""
    evaluator = workspace_setup["evaluator"]
    target = workspace_setup["workspace"] / "valid.json"
    target.write_text(json.dumps({"a": 1}))

    parsed = evaluator.load_json("valid.json")

    assert parsed == {"a": 1}
11
+
12
def test_load_json_missing(workspace_setup):
    """load_json returns None when the file does not exist."""
    evaluator = workspace_setup["evaluator"]
    outcome = evaluator.load_json("missing.json")
    assert outcome is None
15
+
16
def test_load_json_invalid(workspace_setup):
    """load_json returns None when the file holds malformed JSON."""
    evaluator = workspace_setup["evaluator"]
    broken = workspace_setup["workspace"] / "invalid.json"
    broken.write_text("{invalid")

    outcome = evaluator.load_json("invalid.json")

    assert outcome is None
23
+
24
def test_read_file_success(workspace_setup):
    """read_file returns the text content of an existing workspace file."""
    evaluator = workspace_setup["evaluator"]
    document = workspace_setup["workspace"] / "doc.txt"
    document.write_text("hello world")

    content = evaluator.read_file("doc.txt")

    assert content == "hello world"
31
+
32
def test_read_file_missing(workspace_setup):
    """read_file returns None when the file does not exist."""
    evaluator = workspace_setup["evaluator"]
    outcome = evaluator.read_file("missing.txt")
    assert outcome is None
35
+
36
def test_file_exists(workspace_setup):
    """file_exists is truthy for workspace files and falsy for missing paths and directories."""
    workspace = workspace_setup["workspace"]
    evaluator = workspace_setup["evaluator"]

    # Absent before the file is created, present afterwards.
    assert not evaluator.file_exists("foo.txt")
    workspace.joinpath("foo.txt").write_text("hello")
    assert evaluator.file_exists("foo.txt")

    # A directory must not be reported as a file.
    workspace.joinpath("bar").mkdir()
    assert not evaluator.file_exists("bar")
47
+
48
def test_dir_exists(workspace_setup):
    """dir_exists is truthy for workspace directories and falsy for missing paths and files."""
    workspace = workspace_setup["workspace"]
    evaluator = workspace_setup["evaluator"]

    # Absent before the directory is created, present afterwards.
    assert not evaluator.dir_exists("bar")
    workspace.joinpath("bar").mkdir()
    assert evaluator.dir_exists("bar")

    # A regular file must not be reported as a directory.
    workspace.joinpath("foo.txt").write_text("hello")
    assert not evaluator.dir_exists("foo.txt")
@@ -0,0 +1,64 @@
1
+ import os
2
+ import pytest
3
+ from pydantic import BaseModel, Field
4
+ from conftest import DummyRubric
5
+
6
+
7
def test_llm_judge_real_success(workspace_setup):
    """A live llm_judge call returns a DummyRubric and records one success trace.

    Skipped unless GEMINI_API_KEY is set, since it calls the real API.
    """
    if not os.environ.get("GEMINI_API_KEY"):
        pytest.skip("GEMINI_API_KEY environment variable not set")

    evaluator = workspace_setup["evaluator"]
    model_name = "google/gemini-3.5-flash"
    system_text = "You are a strict helper grader."
    prompt_text = "Please grade the politeness of this string: 'Hello, could you please help me with my task?'"

    verdict = evaluator.llm_judge(
        model=model_name,
        response_model=DummyRubric,
        system=system_text,
        prompt=prompt_text,
    )

    assert isinstance(verdict, DummyRubric)
    assert 0.0 <= verdict.score <= 1.0
    assert len(verdict.reasoning) > 0

    # Exactly one trace is expected, mirroring the request and the response.
    assert len(evaluator.traces) == 1
    expected_trace = {
        "model": model_name,
        "system": system_text,
        "prompt": prompt_text,
        "kwargs": {},
        "response_model_schema": DummyRubric.model_json_schema(),
        "status": "success",
        "response": {"score": verdict.score, "reasoning": verdict.reasoning},
        "metadata": {},
    }
    assert evaluator.traces[0] == expected_trace
36
+
37
+
38
def test_llm_judge_real_failure(workspace_setup):
    """A failing llm_judge call re-raises and records one failed trace.

    Skipped unless GEMINI_API_KEY is set, since it calls the real API.
    """
    if not os.environ.get("GEMINI_API_KEY"):
        pytest.skip("GEMINI_API_KEY environment variable not set")

    evaluator = workspace_setup["evaluator"]
    # A model name that does not exist forces a genuine API / SDK error.
    bogus_model = "google/invalid-model-name-does-not-exist"

    with pytest.raises(Exception) as raised:
        evaluator.llm_judge(
            model=bogus_model,
            response_model=DummyRubric,
            system="be strict",
            prompt="evaluate guide",
        )

    # The failure must be traced, with the error text taken from the exception.
    assert len(evaluator.traces) == 1
    expected_trace = {
        "model": bogus_model,
        "system": "be strict",
        "prompt": "evaluate guide",
        "kwargs": {},
        "response_model_schema": DummyRubric.model_json_schema(),
        "status": "failed",
        "error": str(raised.value),
        "metadata": {},
    }
    assert evaluator.traces[0] == expected_trace
@@ -0,0 +1,152 @@
1
+ import json
2
+ import pytest
3
+
4
def test_criterion_duplicate_name_rejected(workspace_setup):
    """Registering a second criterion under an existing name raises ValueError."""
    evaluator = workspace_setup["evaluator"]

    def check_a(ws):
        return True

    def check_b(ws):
        return True

    # First registration succeeds; the second reuses the name and must fail.
    evaluator.criterion("same_name")(check_a)

    with pytest.raises(ValueError, match="Duplicate criterion name: 'same_name'"):
        evaluator.criterion("same_name")(check_b)
15
+
16
def test_criterion_registration(workspace_setup):
    """The criterion decorator records name, weight, function and fatal flag."""
    ev = workspace_setup["evaluator"]

    @ev.criterion("check_1", weight=2.5)
    def my_check(ws):
        return True

    assert len(ev.criteria) == 1
    registered = ev.criteria[0]
    assert registered.name == "check_1"
    assert registered.weight == 2.5
    # Functions compare by identity, so `is` states the intent directly.
    assert registered.func is my_check
    # `== False` would also accept 0; require the actual boolean.
    assert registered.fatal is False
28
+
29
def test_run_weighted_scoring(workspace_setup):
    """run() writes per-criterion scores and their weight-normalised reward."""
    evaluator = workspace_setup["evaluator"]
    reward_path = workspace_setup["output_path"]

    def check_true(ws):
        return True

    def check_false(ws):
        return False

    def check_float(ws):
        return 0.5

    evaluator.criterion("check_true", weight=3.0)(check_true)
    evaluator.criterion("check_false", weight=1.0)(check_false)
    evaluator.criterion("check_float", weight=2.0)(check_float)

    evaluator.run()

    # Weights sum to 3.0 + 1.0 + 2.0 = 6.0.
    # Weighted total is 1.0 * 3.0 + 0.0 * 1.0 + 0.5 * 2.0 = 4.0.
    # Reward is therefore 4.0 / 6.0, i.e. 0.6667 at four decimals.
    assert reward_path.exists()
    report = json.loads(reward_path.read_text())
    report["reward"] = round(report["reward"], 4)

    expected_report = {
        "reward": 0.6667,
        "check_true": 1.0,
        "check_false": 0.0,
        "check_float": 0.5,
    }
    assert report == expected_report
60
+
61
def test_run_handles_exceptions(workspace_setup):
    """A criterion that raises is scored 0.0 while run() still completes."""
    evaluator = workspace_setup["evaluator"]
    reward_path = workspace_setup["output_path"]

    def check_pass(ws):
        return True

    def check_crash(ws):
        raise ValueError("Simulated crash")

    evaluator.criterion("check_pass", weight=1.0)(check_pass)
    evaluator.criterion("check_crash", weight=1.0)(check_crash)

    evaluator.run()

    # Two unit weights: (1.0 * 1.0 + 0.0 * 1.0) / 2.0 = 0.5.
    report = json.loads(reward_path.read_text())
    expected_report = {
        "reward": 0.5,
        "check_pass": 1.0,
        "check_crash": 0.0,
    }
    assert report == expected_report
85
+
86
def test_fatal_criterion_fails(workspace_setup):
    """A failed fatal criterion zeroes the reward and stops later criteria."""
    evaluator = workspace_setup["evaluator"]
    reward_path = workspace_setup["output_path"]

    def check_file(ws):
        return False

    def check_content(ws):
        return 1.0

    evaluator.criterion("file_check", weight=0.10, fatal=True)(check_file)
    evaluator.criterion("content_check", weight=0.90)(check_content)

    evaluator.run()

    report = json.loads(reward_path.read_text())
    # The fatal failure forces reward 0.0, and content_check is absent from
    # the report because it never ran.
    assert report["reward"] == 0.0
    assert "content_check" not in report
104
+
105
def test_criterion_invalid_return_type_raises(workspace_setup):
    """A criterion that returns None makes ev.run() raise ValueError."""
    ev = workspace_setup["evaluator"]

    @ev.criterion("forgot_return")
    def forgot_return(ws):
        pass  # returns None

    # pytest.raises(match=...) performs a regex search, so an unescaped "|"
    # is alternation and the pattern would pass on "must return bool " alone.
    # Escaping the pipes matches the whole phrase literally.
    # NOTE(review): assumes the error message contains the literal text
    # "must return bool | int | float" -- confirm against the evaluator.
    with pytest.raises(ValueError, match=r"must return bool \| int \| float"):
        ev.run()
114
+
115
+
116
def test_criterion_crash_still_scores_zero(workspace_setup):
    """An exception inside a criterion is swallowed by run() and scored 0.0."""
    evaluator = workspace_setup["evaluator"]
    reward_path = workspace_setup["output_path"]

    def ok(ws):
        return True

    def boom(ws):
        raise RuntimeError("kaboom")

    evaluator.criterion("ok", weight=1.0)(ok)
    evaluator.criterion("boom", weight=1.0)(boom)

    # Unlike an invalid return type, a real crash must not propagate out of
    # run(); it only costs the criterion its score.
    evaluator.run()

    report = json.loads(reward_path.read_text())
    expected_report = {"reward": 0.5, "ok": 1.0, "boom": 0.0}
    assert report == expected_report
132
+
133
+
134
def test_fatal_criterion_passes(workspace_setup):
    """A passing fatal criterion is scored like any other criterion."""
    evaluator = workspace_setup["evaluator"]
    reward_path = workspace_setup["output_path"]

    def check_file(ws):
        return True

    def check_content(ws):
        return 0.8

    evaluator.criterion("file_check", weight=1.0, fatal=True)(check_file)
    evaluator.criterion("content_check", weight=1.0)(check_content)

    evaluator.run()

    report = json.loads(reward_path.read_text())
    # The fatal check passed, so scoring proceeds normally:
    # (1.0 * 1.0 + 0.8 * 1.0) / 2.0 = 0.9
    assert report["reward"] == 0.9
    expected_report = {"reward": 0.9, "file_check": 1.0, "content_check": 0.8}
    assert report == expected_report