harbor-rewardkit 0.1.dev1__tar.gz → 0.1.dev4__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (42)
  1. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/PKG-INFO +1 -1
  2. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/pyproject.toml +1 -1
  3. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/csv_cell_equals.py +9 -1
  4. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/diff_ratio.py +5 -0
  5. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_contains.py +5 -0
  6. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_contains_regex.py +5 -0
  7. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_matches.py +5 -0
  8. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/files_equal.py +5 -0
  9. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_similarity.py +7 -4
  10. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_size_equals.py +7 -2
  11. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_key_equals.py +8 -1
  12. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_path_equals.py +11 -1
  13. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/sqlite_query_equals.py +5 -0
  14. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/xlsx_cell_equals.py +13 -3
  15. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/isolation.py +1 -1
  16. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/judges.py +141 -71
  17. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/models.py +10 -0
  18. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/reward.py +34 -10
  19. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/runner.py +27 -3
  20. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/README.md +0 -0
  21. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/__init__.py +0 -0
  22. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/__main__.py +0 -0
  23. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/compare.py +0 -0
  24. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/__init__.py +0 -0
  25. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/_command.py +0 -0
  26. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/_trajectory.py +0 -0
  27. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_contains.py +0 -0
  28. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_matches.py +0 -0
  29. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_matches_regex.py +0 -0
  30. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_succeeds.py +0 -0
  31. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_exists.py +0 -0
  32. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_not_exists.py +0 -0
  33. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/http_response_contains.py +0 -0
  34. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/http_status_equals.py +0 -0
  35. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_tool_not_used.py +0 -0
  36. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_tool_used.py +0 -0
  37. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_turn_count.py +0 -0
  38. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/agent.md +0 -0
  39. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/llm.md +0 -0
  40. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/llm_trajectory.md +0 -0
  41. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/session.py +0 -0
  42. {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/trajectory.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: harbor-rewardkit
3
- Version: 0.1.dev1
3
+ Version: 0.1.dev4
4
4
  Summary: Lightweight grading toolkit for environment-based tasks.
5
5
  Keywords: grading,evaluation,rewards,llm,agents,benchmarks
6
6
  Author: benediktstroebl
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "harbor-rewardkit"
3
- version = "0.1.dev1"
3
+ version = "0.1.dev4"
4
4
  description = "Lightweight grading toolkit for environment-based tasks."
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -1,6 +1,7 @@
1
1
  """Criterion: check that a specific CSV cell has the expected value."""
2
2
 
3
3
  import csv
4
+ import warnings
4
5
  from pathlib import Path
5
6
 
6
7
  from rewardkit.session import criterion
@@ -29,5 +30,12 @@ def csv_cell_equals(
29
30
  return r[col] == expected
30
31
  return False
31
32
  return False
32
- except (FileNotFoundError, OSError, csv.Error, KeyError, IndexError):
33
+ except (FileNotFoundError, OSError) as e:
34
+ if isinstance(e, FileNotFoundError):
35
+ warnings.warn(
36
+ f"csv_cell_equals: '{path}' not found in workspace, assigning reward 0",
37
+ stacklevel=2,
38
+ )
39
+ return False
40
+ except (csv.Error, KeyError, IndexError):
33
41
  return False
@@ -1,5 +1,6 @@
1
1
  """Criterion: fuzzy text comparison returning a similarity ratio [0, 1]."""
2
2
 
3
+ import warnings
3
4
  from difflib import SequenceMatcher
4
5
  from pathlib import Path
5
6
 
@@ -11,5 +12,9 @@ def diff_ratio(workspace: Path, path: str, expected: str) -> float:
11
12
  try:
12
13
  content = (workspace / path).read_text()
13
14
  except (FileNotFoundError, OSError):
15
+ warnings.warn(
16
+ f"diff_ratio: '{path}' not found in workspace, assigning reward 0",
17
+ stacklevel=2,
18
+ )
14
19
  return 0.0
15
20
  return SequenceMatcher(None, content.strip(), expected.strip()).ratio()
@@ -1,5 +1,6 @@
1
1
  """Criterion: check that a file contains a given string."""
2
2
 
3
+ import warnings
3
4
  from pathlib import Path
4
5
 
5
6
  from rewardkit.session import criterion
@@ -10,4 +11,8 @@ def file_contains(workspace: Path, path: str, text: str) -> bool:
10
11
  try:
11
12
  return text in (workspace / path).read_text()
12
13
  except (FileNotFoundError, OSError):
14
+ warnings.warn(
15
+ f"file_contains: '{path}' not found in workspace, assigning reward 0",
16
+ stacklevel=2,
17
+ )
13
18
  return False
@@ -1,6 +1,7 @@
1
1
  """Criterion: check that a file contains text matching a regex pattern."""
2
2
 
3
3
  import re
4
+ import warnings
4
5
  from pathlib import Path
5
6
 
6
7
  from rewardkit.session import criterion
@@ -11,4 +12,8 @@ def file_contains_regex(workspace: Path, path: str, pattern: str) -> bool:
11
12
  try:
12
13
  return re.search(pattern, (workspace / path).read_text()) is not None
13
14
  except (FileNotFoundError, OSError):
15
+ warnings.warn(
16
+ f"file_contains_regex: '{path}' not found in workspace, assigning reward 0",
17
+ stacklevel=2,
18
+ )
14
19
  return False
@@ -1,5 +1,6 @@
1
1
  """Criterion: check that a file's content matches an expected string."""
2
2
 
3
+ import warnings
3
4
  from pathlib import Path
4
5
 
5
6
  from rewardkit.session import criterion
@@ -12,4 +13,8 @@ def file_matches(workspace: Path, path: str, expected: str) -> bool:
12
13
  try:
13
14
  return (workspace / path).read_text().strip() == expected.strip()
14
15
  except (FileNotFoundError, OSError):
16
+ warnings.warn(
17
+ f"file_matches: '{path}' not found in workspace, assigning reward 0",
18
+ stacklevel=2,
19
+ )
15
20
  return False
@@ -1,5 +1,6 @@
1
1
  """Criterion: check that two workspace files have equal content."""
2
2
 
3
+ import warnings
3
4
  from pathlib import Path
4
5
 
5
6
  from rewardkit.session import criterion
@@ -12,4 +13,8 @@ def files_equal(workspace: Path, path1: str, path2: str) -> bool:
12
13
  workspace / path2
13
14
  ).read_text().strip()
14
15
  except (FileNotFoundError, OSError):
16
+ warnings.warn(
17
+ f"files_equal: '{path1}' or '{path2}' not found in workspace, assigning reward 0",
18
+ stacklevel=2,
19
+ )
15
20
  return False
@@ -1,5 +1,6 @@
1
1
  """Criterion: compare two images pixel-by-pixel and return similarity ratio."""
2
2
 
3
+ import warnings
3
4
  from pathlib import Path
4
5
 
5
6
  from rewardkit.session import criterion
@@ -12,7 +13,7 @@ def image_similarity(workspace: Path, path1: str, path2: str) -> float:
12
13
  except ImportError:
13
14
  raise ImportError(
14
15
  "image_similarity requires Pillow. "
15
- "Install with: pip install rewardkit[image]"
16
+ "Install with: uv add harbor-rewardkit[image]"
16
17
  )
17
18
  try:
18
19
  with (
@@ -25,8 +26,6 @@ def image_similarity(workspace: Path, path1: str, path2: str) -> float:
25
26
  rgba2 = img2.convert("RGBA")
26
27
  diff = ImageChops.difference(rgba1, rgba2)
27
28
  pixel_count = rgba1.size[0] * rgba1.size[1]
28
- # A pixel matches iff all 4 RGBA channels have zero difference.
29
- # Use per-channel point() to build a mask in C rather than Python loops.
30
29
  r, g, b, a = diff.split()
31
30
 
32
31
  def zero_fn(x: int) -> int:
@@ -38,5 +37,9 @@ def image_similarity(workspace: Path, path1: str, path2: str) -> float:
38
37
  )
39
38
  matching = mask.tobytes().count(b"\xff")
40
39
  return matching / pixel_count
41
- except Exception:
40
+ except (FileNotFoundError, OSError):
41
+ warnings.warn(
42
+ f"image_similarity: '{path1}' or '{path2}' not found in workspace, assigning reward 0",
43
+ stacklevel=2,
44
+ )
42
45
  return 0.0
@@ -1,5 +1,6 @@
1
1
  """Criterion: check that an image has the expected dimensions."""
2
2
 
3
+ import warnings
3
4
  from pathlib import Path
4
5
 
5
6
  from rewardkit.session import criterion
@@ -12,10 +13,14 @@ def image_size_equals(workspace: Path, path: str, width: int, height: int) -> bo
12
13
  except ImportError:
13
14
  raise ImportError(
14
15
  "image_size_equals requires Pillow. "
15
- "Install with: pip install rewardkit[image]"
16
+ "Install with: uv add harbor-rewardkit[image]"
16
17
  )
17
18
  try:
18
19
  with Image.open(workspace / path) as img:
19
20
  return img.size == (width, height)
20
- except Exception:
21
+ except (FileNotFoundError, OSError):
22
+ warnings.warn(
23
+ f"image_size_equals: '{path}' not found in workspace, assigning reward 0",
24
+ stacklevel=2,
25
+ )
21
26
  return False
@@ -1,6 +1,7 @@
1
1
  """Criterion: check that a JSON file has a specific key-value pair."""
2
2
 
3
3
  import json as _json
4
+ import warnings
4
5
  from pathlib import Path
5
6
 
6
7
  from rewardkit.session import criterion
@@ -13,5 +14,11 @@ def json_key_equals(workspace: Path, path: str, key: str, expected: object) -> b
13
14
  if not isinstance(data, dict):
14
15
  return False
15
16
  return data.get(key) == expected
16
- except (FileNotFoundError, OSError, ValueError, KeyError):
17
+ except (FileNotFoundError, OSError):
18
+ warnings.warn(
19
+ f"json_key_equals: '{path}' not found in workspace, assigning reward 0",
20
+ stacklevel=2,
21
+ )
22
+ return False
23
+ except (ValueError, KeyError):
17
24
  return False
@@ -1,6 +1,7 @@
1
1
  """Criterion: check a nested JSON path against an expected value."""
2
2
 
3
3
  import json as _json
4
+ import warnings
4
5
  from pathlib import Path
5
6
 
6
7
  from rewardkit.session import criterion
@@ -12,6 +13,15 @@ def json_path_equals(
12
13
  ) -> bool:
13
14
  try:
14
15
  data = _json.loads((workspace / path).read_text())
16
+ except (FileNotFoundError, OSError):
17
+ warnings.warn(
18
+ f"json_path_equals: '{path}' not found in workspace, assigning reward 0",
19
+ stacklevel=2,
20
+ )
21
+ return False
22
+ except ValueError:
23
+ return False
24
+ try:
15
25
  for segment in json_path.split("."):
16
26
  if isinstance(data, dict):
17
27
  data = data[segment]
@@ -20,5 +30,5 @@ def json_path_equals(
20
30
  else:
21
31
  return False
22
32
  return data == expected
23
- except (FileNotFoundError, OSError, ValueError, KeyError, IndexError, TypeError):
33
+ except (KeyError, IndexError, TypeError, ValueError):
24
34
  return False
@@ -1,6 +1,7 @@
1
1
  """Criterion: run a SQL query against a SQLite database and check the result."""
2
2
 
3
3
  import sqlite3
4
+ import warnings
4
5
  from pathlib import Path
5
6
 
6
7
  from rewardkit.session import criterion
@@ -12,6 +13,10 @@ def sqlite_query_equals(
12
13
  ) -> bool:
13
14
  target = workspace / db_path
14
15
  if not target.exists():
16
+ warnings.warn(
17
+ f"sqlite_query_equals: '{db_path}' not found in workspace, assigning reward 0",
18
+ stacklevel=2,
19
+ )
15
20
  return False
16
21
  try:
17
22
  with sqlite3.connect(str(target)) as conn:
@@ -1,5 +1,6 @@
1
1
  """Criterion: check that a specific cell in an xlsx file has the expected value."""
2
2
 
3
+ import warnings
3
4
  from pathlib import Path
4
5
 
5
6
  from rewardkit.session import criterion
@@ -14,15 +15,24 @@ def xlsx_cell_equals(
14
15
  except ImportError:
15
16
  raise ImportError(
16
17
  "xlsx_cell_equals requires openpyxl. "
17
- "Install with: pip install rewardkit[office]"
18
+ "Install with: uv add harbor-rewardkit[office]"
18
19
  )
19
20
  try:
20
21
  wb = openpyxl.load_workbook(
21
22
  str(workspace / path), read_only=True, data_only=True
22
23
  )
24
+ except (FileNotFoundError, OSError) as e:
25
+ if isinstance(e, FileNotFoundError):
26
+ warnings.warn(
27
+ f"xlsx_cell_equals: '{path}' not found in workspace, assigning reward 0",
28
+ stacklevel=2,
29
+ )
30
+ return False
31
+ try:
23
32
  ws = wb[sheet] if sheet else wb.active
24
33
  value = ws[cell].value
25
- wb.close()
26
34
  return value == expected
27
- except (FileNotFoundError, OSError, KeyError, ValueError):
35
+ except (KeyError, ValueError):
28
36
  return False
37
+ finally:
38
+ wb.close()
@@ -112,8 +112,8 @@ class _Overlay:
112
112
  def isolate(path: Path) -> Generator[Path, None, None]:
113
113
  """Yield an overlayfs view of *path*. Writes go to a tmpdir; *path* is untouched."""
114
114
  ov = _Overlay(path)
115
- ov.mount()
116
115
  try:
116
+ ov.mount()
117
117
  yield ov._merged
118
118
  finally:
119
119
  ov.cleanup()
@@ -10,6 +10,7 @@ import os
10
10
  import re
11
11
  import shutil
12
12
  import subprocess
13
+ import tempfile
13
14
  from importlib import resources
14
15
  from pathlib import Path
15
16
  from typing import Any
@@ -43,6 +44,28 @@ def _build_criteria_block(criteria: list[Criterion]) -> str:
43
44
  return "\n".join(lines)
44
45
 
45
46
 
47
+ def _build_response_schema(criteria: list[Criterion]) -> dict[str, Any]:
48
+ """Build a JSON Schema that enforces the expected judge response structure."""
49
+ props: dict[str, Any] = {}
50
+ for c in criteria:
51
+ name = c.name or "criterion"
52
+ props[name] = {
53
+ "type": "object",
54
+ "properties": {
55
+ "score": c.output_format.json_schema(),
56
+ "reasoning": {"type": "string"},
57
+ },
58
+ "required": ["score", "reasoning"],
59
+ "additionalProperties": False,
60
+ }
61
+ return {
62
+ "type": "object",
63
+ "properties": props,
64
+ "required": list(props.keys()),
65
+ "additionalProperties": False,
66
+ }
67
+
68
+
46
69
  def build_prompt(
47
70
  criteria: list[Criterion],
48
71
  template: str | None = None,
@@ -129,6 +152,9 @@ def _text_from_blocks(blocks: list[ContentBlock]) -> str:
129
152
  return "\n\n".join(b["text"] for b in blocks if b.get("type") == "text")
130
153
 
131
154
 
155
+ _MAX_JUDGE_RETRIES = 3
156
+
157
+
132
158
  def parse_judge_response(
133
159
  text: str,
134
160
  criteria: list[Criterion],
@@ -147,8 +173,13 @@ def parse_judge_response(
147
173
  scores: list[Score] = []
148
174
  for i, c in enumerate(criteria):
149
175
  cname = c.name or f"criterion_{i}"
150
- entry = data.get(cname, {})
151
- raw_score = entry.get("score", 0)
176
+ entry = data.get(cname)
177
+ if not isinstance(entry, dict) or "score" not in entry:
178
+ raise ValueError(
179
+ f"Criterion {cname!r}: expected dict with 'score' and 'reasoning', "
180
+ f"got {type(entry).__name__}: {str(entry)[:100]}"
181
+ )
182
+ raw_score = entry["score"]
152
183
  reasoning = entry.get("reasoning", "")
153
184
  value = c.output_format.normalize(raw_score)
154
185
  weight = weights[i] if weights else 1.0
@@ -165,24 +196,6 @@ def parse_judge_response(
165
196
  return scores
166
197
 
167
198
 
168
- def _fallback_scores(
169
- criteria: list[Criterion],
170
- weights: list[float] | None,
171
- error: str,
172
- ) -> list[Score]:
173
- return [
174
- Score(
175
- name=c.name or f"criterion_{i}",
176
- value=0.0,
177
- raw=False,
178
- weight=weights[i] if weights else 1.0,
179
- error=error,
180
- description=c.description,
181
- )
182
- for i, c in enumerate(criteria)
183
- ]
184
-
185
-
186
199
  async def arun_llm(
187
200
  judge: LLMJudge,
188
201
  criteria: list[Criterion],
@@ -220,6 +233,14 @@ async def arun_llm(
220
233
  )
221
234
  available_tokens = max_input_tokens - prompt_tokens - user_tokens - 32_000
222
235
 
236
+ if available_tokens <= 0:
237
+ raise ValueError(
238
+ f"Trajectory too large to include in judge prompt: "
239
+ f"no token budget remaining "
240
+ f"(prompt={prompt_tokens}, user={user_tokens}, "
241
+ f"limit={max_input_tokens})."
242
+ )
243
+
223
244
  traj_text = format_trajectory(
224
245
  judge.atif_trajectory,
225
246
  max_tokens=available_tokens,
@@ -231,23 +252,34 @@ async def arun_llm(
231
252
  messages: list[dict[str, Any]] = [{"role": "system", "content": prompt}]
232
253
  if user_blocks:
233
254
  messages.append({"role": "user", "content": user_blocks})
234
- resp = await litellm.acompletion(
235
- model=judge.model,
236
- messages=messages,
237
- response_format={"type": "json_object"},
238
- max_tokens=4096,
239
- timeout=judge.timeout,
240
- reasoning_effort=judge.reasoning_effort,
241
- )
242
- raw_output = resp.choices[0].message.content
243
- try:
244
- scores = parse_judge_response(raw_output, criteria, weights)
245
- except (ValueError, json.JSONDecodeError) as e:
246
- warn_list.append(f"Judge response parse error: {e}")
247
- scores = _fallback_scores(
248
- criteria, weights, f"Failed to parse judge response: {e}"
255
+ for attempt in range(_MAX_JUDGE_RETRIES):
256
+ resp = await litellm.acompletion(
257
+ model=judge.model,
258
+ messages=messages,
259
+ response_format={
260
+ "type": "json_schema",
261
+ "json_schema": {
262
+ "name": "judge_response",
263
+ "schema": _build_response_schema(criteria),
264
+ "strict": True,
265
+ },
266
+ },
267
+ timeout=judge.timeout,
268
+ reasoning_effort=judge.reasoning_effort,
249
269
  )
250
- return scores, raw_output, warn_list
270
+ raw_output = resp.choices[0].message.content
271
+ try:
272
+ scores = parse_judge_response(raw_output, criteria, weights)
273
+ return scores, raw_output, warn_list
274
+ except ValueError:
275
+ if attempt == _MAX_JUDGE_RETRIES - 1:
276
+ raise
277
+ logger.debug(
278
+ "Judge response did not match schema, retrying (%d/%d)",
279
+ attempt + 1,
280
+ _MAX_JUDGE_RETRIES,
281
+ )
282
+ raise RuntimeError("Unreachable")
251
283
 
252
284
 
253
285
  def _is_alpine() -> bool:
@@ -315,51 +347,89 @@ async def arun_agent(
315
347
  prompt = build_prompt(criteria, kind="agent")
316
348
  if judge.atif_trajectory:
317
349
  prompt += f"\n\nThe agent's trajectory is stored at: {judge.atif_trajectory}"
350
+ schema = _build_response_schema(criteria)
351
+ schema_path: str | None = None
318
352
  if judge.agent == "claude-code":
319
- cmd = ["claude", "-p", prompt, "--output-format", "json"]
353
+ cmd = [
354
+ "claude",
355
+ "-p",
356
+ prompt,
357
+ "--output-format",
358
+ "json",
359
+ "--json-schema",
360
+ json.dumps(schema),
361
+ ]
320
362
  cmd_name = "claude"
321
363
  else:
322
- cmd = ["codex", "exec", prompt]
364
+ fd, schema_path = tempfile.mkstemp(suffix=".json")
365
+ with os.fdopen(fd, "w") as f:
366
+ json.dump(schema, f)
367
+ cmd = ["codex", "exec", prompt, "--output-schema", schema_path]
323
368
  cmd_name = "codex"
324
369
 
325
370
  if judge.model:
371
+ model_name = judge.model
372
+ # Claude CLI uses bare model names (e.g. "claude-haiku-4-5"),
373
+ # not provider-prefixed ones (e.g. "anthropic/claude-haiku-4-5").
374
+ if judge.agent == "claude-code" and model_name.startswith("anthropic/"):
375
+ model_name = model_name.removeprefix("anthropic/")
326
376
  flag = "-m" if judge.agent == "codex" else "--model"
327
- cmd.extend([flag, judge.model])
377
+ cmd.extend([flag, model_name])
328
378
 
329
379
  _ensure_cli(cmd_name)
330
380
  cwd = judge.cwd or (
331
381
  str(workspace) if workspace and Path(workspace).is_dir() else None
332
382
  )
333
- proc = await asyncio.create_subprocess_exec(
334
- *cmd,
335
- stdout=asyncio.subprocess.PIPE,
336
- stderr=asyncio.subprocess.PIPE,
337
- cwd=cwd,
338
- )
339
- try:
340
- stdout, _stderr = await asyncio.wait_for(
341
- proc.communicate(), timeout=judge.timeout
342
- )
343
- except asyncio.TimeoutError:
344
- proc.kill()
345
- await proc.communicate()
346
- raise
347
- raw_output = stdout.decode()
348
- # Claude CLI with --output-format json wraps the actual response in a
349
- # JSON envelope with a "result" field. Extract the inner text so
350
- # parse_judge_response finds the scoring JSON, not the wrapper.
351
- if judge.agent == "claude-code":
352
- try:
353
- envelope = json.loads(raw_output)
354
- if isinstance(envelope, dict) and "result" in envelope:
355
- raw_output = envelope["result"]
356
- except (json.JSONDecodeError, TypeError):
357
- pass
383
+
358
384
  try:
359
- scores = parse_judge_response(raw_output, criteria, weights)
360
- except (ValueError, json.JSONDecodeError) as e:
361
- warn_list.append(f"Agent judge response parse error: {e}")
362
- scores = _fallback_scores(
363
- criteria, weights, f"Failed to parse agent judge response: {e}"
364
- )
365
- return scores, raw_output, warn_list
385
+ for attempt in range(_MAX_JUDGE_RETRIES):
386
+ proc = await asyncio.create_subprocess_exec(
387
+ *cmd,
388
+ stdout=asyncio.subprocess.PIPE,
389
+ stderr=asyncio.subprocess.PIPE,
390
+ cwd=cwd,
391
+ )
392
+ try:
393
+ stdout, _stderr = await asyncio.wait_for(
394
+ proc.communicate(), timeout=judge.timeout
395
+ )
396
+ except asyncio.TimeoutError:
397
+ proc.kill()
398
+ await proc.communicate()
399
+ raise
400
+ raw_output = stdout.decode()
401
+ if proc.returncode != 0:
402
+ stderr_text = _stderr.decode().strip() if _stderr else ""
403
+ raise ValueError(
404
+ f"Agent CLI '{cmd_name}' exited with code {proc.returncode}: "
405
+ f"{stderr_text or raw_output[:200]}"
406
+ )
407
+ # Claude CLI with --output-format json and --json-schema wraps the
408
+ # structured response in an envelope with a "structured_output" field.
409
+ if judge.agent == "claude-code":
410
+ try:
411
+ envelope = json.loads(raw_output)
412
+ if isinstance(envelope, dict):
413
+ if envelope.get("is_error"):
414
+ raise ValueError(
415
+ f"Claude CLI returned an error: {envelope.get('result', raw_output[:200])}"
416
+ )
417
+ if "structured_output" in envelope:
418
+ raw_output = json.dumps(envelope["structured_output"])
419
+ except (json.JSONDecodeError, TypeError):
420
+ pass
421
+ try:
422
+ scores = parse_judge_response(raw_output, criteria, weights)
423
+ return scores, raw_output, warn_list
424
+ except ValueError:
425
+ if attempt == _MAX_JUDGE_RETRIES - 1:
426
+ raise
427
+ logger.debug(
428
+ "Agent judge response did not match schema, retrying (%d/%d)",
429
+ attempt + 1,
430
+ _MAX_JUDGE_RETRIES,
431
+ )
432
+ raise RuntimeError("Unreachable")
433
+ finally:
434
+ if schema_path:
435
+ Path(schema_path).unlink(missing_ok=True)
@@ -14,6 +14,7 @@ Aggregation = Literal["weighted_mean", "all_pass", "any_pass", "threshold"]
14
14
  class OutputFormat(Protocol):
15
15
  def normalize(self, raw: float | bool | str) -> float: ...
16
16
  def prompt_fragment(self) -> str: ...
17
+ def json_schema(self) -> dict[str, Any]: ...
17
18
 
18
19
 
19
20
  class Binary(BaseModel):
@@ -29,6 +30,9 @@ class Binary(BaseModel):
29
30
  def prompt_fragment(self) -> str:
30
31
  return '"yes" or "no"'
31
32
 
33
+ def json_schema(self) -> dict[str, Any]:
34
+ return {"type": "string", "enum": ["yes", "no"]}
35
+
32
36
 
33
37
  class Likert(BaseModel):
34
38
  model_config = ConfigDict(frozen=True)
@@ -43,6 +47,9 @@ class Likert(BaseModel):
43
47
  def prompt_fragment(self) -> str:
44
48
  return f"an integer from 1 to {self.points}"
45
49
 
50
+ def json_schema(self) -> dict[str, Any]:
51
+ return {"type": "integer"}
52
+
46
53
 
47
54
  class Numeric(BaseModel):
48
55
  model_config = ConfigDict(frozen=True)
@@ -59,6 +66,9 @@ class Numeric(BaseModel):
59
66
  def prompt_fragment(self) -> str:
60
67
  return f"a number from {self.min} to {self.max}"
61
68
 
69
+ def json_schema(self) -> dict[str, Any]:
70
+ return {"type": "number"}
71
+
62
72
 
63
73
  def _slugify(text: str) -> str:
64
74
  slug = re.sub(r"[^a-z0-9]+", "_", text[:40].lower())
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import asyncio
4
4
  import inspect
5
+ import warnings
5
6
  from contextlib import nullcontext
6
7
  from pathlib import Path
7
8
  from typing import Any, Awaitable, TypeVar
@@ -102,9 +103,24 @@ class Reward:
102
103
  if isinstance(raw, bool):
103
104
  value = 1.0 if raw else 0.0
104
105
  elif isinstance(raw, (int, float)):
105
- value = max(0.0, min(1.0, float(raw)))
106
+ value = float(raw)
107
+ if value > 1.0:
108
+ warnings.warn(
109
+ f"Criterion {fn_name!r} returned {value:.4f} which exceeds 1.0; "
110
+ f"score will not be clamped — verify your criterion logic.",
111
+ stacklevel=2,
112
+ )
113
+ elif value < 0.0:
114
+ warnings.warn(
115
+ f"Criterion {fn_name!r} returned {value:.4f} which is below 0.0; "
116
+ f"score will not be clamped — verify your criterion logic.",
117
+ stacklevel=2,
118
+ )
106
119
  else:
107
- value = 1.0 if raw else 0.0
120
+ raise TypeError(
121
+ f"Criterion {fn_name!r} returned {type(raw).__name__}, "
122
+ f"expected bool, int, or float."
123
+ )
108
124
 
109
125
  return Score(
110
126
  name=fn_name,
@@ -113,15 +129,13 @@ class Reward:
113
129
  weight=weight,
114
130
  description=description,
115
131
  )
132
+ except ImportError as e:
133
+ raise ImportError(
134
+ f"Criterion {fn_name!r} failed due to missing dependency: {e}. "
135
+ f"Install the required extra (e.g. uv add harbor-rewardkit[all])."
136
+ ) from e
116
137
  except Exception as e:
117
- return Score(
118
- name=fn_name,
119
- value=0.0,
120
- raw=False,
121
- weight=weight,
122
- error=str(e),
123
- description=description,
124
- )
138
+ raise RuntimeError(f"Criterion {fn_name!r} failed: {e}") from e
125
139
 
126
140
  def _run_one(self, i: int, fn: Any) -> Score:
127
141
  """Run a single criterion, with isolation if configured."""
@@ -132,6 +146,16 @@ class Reward:
132
146
  return self._eval_criterion(i, fn, self.workspace)
133
147
 
134
148
  async def arun(self, sem: asyncio.Semaphore | None = None) -> list[Score]:
149
+ try:
150
+ return await self._arun_inner(sem)
151
+ except ExceptionGroup as eg:
152
+ # Unwrap TaskGroup ExceptionGroup to surface the first real error.
153
+ first = eg.exceptions[0]
154
+ raise RuntimeError(f"Reward {self.name!r} failed: {first}") from first
155
+ except Exception as e:
156
+ raise RuntimeError(f"Reward {self.name!r} failed: {e}") from e
157
+
158
+ async def _arun_inner(self, sem: asyncio.Semaphore | None) -> list[Score]:
135
159
  if self.judge is None:
136
160
  tasks: list[asyncio.Task[Score]] = []
137
161
  async with asyncio.TaskGroup() as tg:
@@ -18,6 +18,7 @@ from rewardkit.models import (
18
18
  Criterion,
19
19
  LLMJudge,
20
20
  Likert,
21
+ Numeric,
21
22
  )
22
23
  from rewardkit.reward import Reward
23
24
  from rewardkit.session import Session, _builtin_names, _factory_registry, set_current
@@ -28,6 +29,13 @@ def _load_toml(path: Path) -> dict[str, Any]:
28
29
 
29
30
 
30
31
  def _import_py_file(path: Path) -> None:
32
+ """Import a Python file as a module, caching by file-path hash.
33
+
34
+ Once imported, subsequent calls with the same resolved path are
35
+ no-ops. This is intentional for the primary single-run container
36
+ use case but means repeated ``discover()`` or ``run()`` calls in a
37
+ REPL or notebook will not re-execute already-loaded criterion files.
38
+ """
31
39
  import hashlib
32
40
 
33
41
  digest = hashlib.sha1(str(path.resolve()).encode()).hexdigest()[:12]
@@ -48,6 +56,11 @@ def _build_criteria_from_toml(toml_criteria: list[dict[str, Any]]) -> list[Crite
48
56
  fmt_name = c.get("type", "binary")
49
57
  if fmt_name == "likert":
50
58
  output_format = Likert(points=c.get("points", 5))
59
+ elif fmt_name == "numeric":
60
+ output_format = Numeric(
61
+ min=c.get("min", 0.0),
62
+ max=c.get("max", 1.0),
63
+ )
51
64
  else:
52
65
  output_format = Binary()
53
66
  criteria.append(
@@ -372,12 +385,23 @@ def run_multi(
372
385
  to stdout for overlapping reward names.
373
386
  """
374
387
  all_rewards: list[Reward] = []
375
- dir_labels: list[str] = []
388
+ dir_labels = [Path(d).name for d in tests_dirs]
389
+ if len(dir_labels) != len(set(dir_labels)):
390
+ dupes = {name for name in dir_labels if dir_labels.count(name) > 1}
391
+ paths_by_label = {
392
+ name: [str(d) for d, n in zip(tests_dirs, dir_labels) if n == name]
393
+ for name in dupes
394
+ }
395
+ raise ValueError(
396
+ "Duplicate test directory basenames: "
397
+ + ", ".join(
398
+ f"{name!r} ({', '.join(ps)})" for name, ps in paths_by_label.items()
399
+ )
400
+ + ". Use directories with distinct basenames."
401
+ )
376
402
  dir_reward_ranges: list[tuple[int, int]] = [] # (start, end) indices
377
403
 
378
404
  for tests_dir in tests_dirs:
379
- label = Path(tests_dir).name
380
- dir_labels.append(label)
381
405
  rewards = discover(tests_dir, workspace=workspace)
382
406
  start = len(all_rewards)
383
407
  all_rewards.extend(rewards)