harbor-rewardkit 0.1.dev1__tar.gz → 0.1.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/PKG-INFO +1 -1
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/pyproject.toml +1 -1
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/csv_cell_equals.py +9 -1
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/diff_ratio.py +5 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_contains.py +5 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_contains_regex.py +5 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_matches.py +5 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/files_equal.py +5 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_similarity.py +7 -4
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_size_equals.py +7 -2
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_key_equals.py +8 -1
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_path_equals.py +11 -1
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/sqlite_query_equals.py +5 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/xlsx_cell_equals.py +13 -3
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/isolation.py +1 -1
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/judges.py +141 -71
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/models.py +10 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/reward.py +34 -10
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/runner.py +27 -3
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/README.md +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/__init__.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/__main__.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/compare.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/__init__.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/_command.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/_trajectory.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_contains.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_matches.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_matches_regex.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_succeeds.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_exists.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_not_exists.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/http_response_contains.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/http_status_equals.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_tool_not_used.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_tool_used.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_turn_count.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/agent.md +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/llm.md +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/llm_trajectory.md +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/session.py +0 -0
- {harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/trajectory.py +0 -0
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/csv_cell_equals.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Criterion: check that a specific CSV cell has the expected value."""
|
|
2
2
|
|
|
3
3
|
import csv
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from rewardkit.session import criterion
|
|
@@ -29,5 +30,12 @@ def csv_cell_equals(
|
|
|
29
30
|
return r[col] == expected
|
|
30
31
|
return False
|
|
31
32
|
return False
|
|
32
|
-
except (FileNotFoundError, OSError
|
|
33
|
+
except (FileNotFoundError, OSError) as e:
|
|
34
|
+
if isinstance(e, FileNotFoundError):
|
|
35
|
+
warnings.warn(
|
|
36
|
+
f"csv_cell_equals: '{path}' not found in workspace, assigning reward 0",
|
|
37
|
+
stacklevel=2,
|
|
38
|
+
)
|
|
39
|
+
return False
|
|
40
|
+
except (csv.Error, KeyError, IndexError):
|
|
33
41
|
return False
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/diff_ratio.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: fuzzy text comparison returning a similarity ratio [0, 1]."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from difflib import SequenceMatcher
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
@@ -11,5 +12,9 @@ def diff_ratio(workspace: Path, path: str, expected: str) -> float:
|
|
|
11
12
|
try:
|
|
12
13
|
content = (workspace / path).read_text()
|
|
13
14
|
except (FileNotFoundError, OSError):
|
|
15
|
+
warnings.warn(
|
|
16
|
+
f"diff_ratio: '{path}' not found in workspace, assigning reward 0",
|
|
17
|
+
stacklevel=2,
|
|
18
|
+
)
|
|
14
19
|
return 0.0
|
|
15
20
|
return SequenceMatcher(None, content.strip(), expected.strip()).ratio()
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_contains.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: check that a file contains a given string."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -10,4 +11,8 @@ def file_contains(workspace: Path, path: str, text: str) -> bool:
|
|
|
10
11
|
try:
|
|
11
12
|
return text in (workspace / path).read_text()
|
|
12
13
|
except (FileNotFoundError, OSError):
|
|
14
|
+
warnings.warn(
|
|
15
|
+
f"file_contains: '{path}' not found in workspace, assigning reward 0",
|
|
16
|
+
stacklevel=2,
|
|
17
|
+
)
|
|
13
18
|
return False
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Criterion: check that a file contains text matching a regex pattern."""
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from rewardkit.session import criterion
|
|
@@ -11,4 +12,8 @@ def file_contains_regex(workspace: Path, path: str, pattern: str) -> bool:
|
|
|
11
12
|
try:
|
|
12
13
|
return re.search(pattern, (workspace / path).read_text()) is not None
|
|
13
14
|
except (FileNotFoundError, OSError):
|
|
15
|
+
warnings.warn(
|
|
16
|
+
f"file_contains_regex: '{path}' not found in workspace, assigning reward 0",
|
|
17
|
+
stacklevel=2,
|
|
18
|
+
)
|
|
14
19
|
return False
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_matches.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: check that a file's content matches an expected string."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -12,4 +13,8 @@ def file_matches(workspace: Path, path: str, expected: str) -> bool:
|
|
|
12
13
|
try:
|
|
13
14
|
return (workspace / path).read_text().strip() == expected.strip()
|
|
14
15
|
except (FileNotFoundError, OSError):
|
|
16
|
+
warnings.warn(
|
|
17
|
+
f"file_matches: '{path}' not found in workspace, assigning reward 0",
|
|
18
|
+
stacklevel=2,
|
|
19
|
+
)
|
|
15
20
|
return False
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/files_equal.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: check that two workspace files have equal content."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -12,4 +13,8 @@ def files_equal(workspace: Path, path1: str, path2: str) -> bool:
|
|
|
12
13
|
workspace / path2
|
|
13
14
|
).read_text().strip()
|
|
14
15
|
except (FileNotFoundError, OSError):
|
|
16
|
+
warnings.warn(
|
|
17
|
+
f"files_equal: '{path1}' or '{path2}' not found in workspace, assigning reward 0",
|
|
18
|
+
stacklevel=2,
|
|
19
|
+
)
|
|
15
20
|
return False
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_similarity.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: compare two images pixel-by-pixel and return similarity ratio."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -12,7 +13,7 @@ def image_similarity(workspace: Path, path1: str, path2: str) -> float:
|
|
|
12
13
|
except ImportError:
|
|
13
14
|
raise ImportError(
|
|
14
15
|
"image_similarity requires Pillow. "
|
|
15
|
-
"Install with:
|
|
16
|
+
"Install with: uv add harbor-rewardkit[image]"
|
|
16
17
|
)
|
|
17
18
|
try:
|
|
18
19
|
with (
|
|
@@ -25,8 +26,6 @@ def image_similarity(workspace: Path, path1: str, path2: str) -> float:
|
|
|
25
26
|
rgba2 = img2.convert("RGBA")
|
|
26
27
|
diff = ImageChops.difference(rgba1, rgba2)
|
|
27
28
|
pixel_count = rgba1.size[0] * rgba1.size[1]
|
|
28
|
-
# A pixel matches iff all 4 RGBA channels have zero difference.
|
|
29
|
-
# Use per-channel point() to build a mask in C rather than Python loops.
|
|
30
29
|
r, g, b, a = diff.split()
|
|
31
30
|
|
|
32
31
|
def zero_fn(x: int) -> int:
|
|
@@ -38,5 +37,9 @@ def image_similarity(workspace: Path, path1: str, path2: str) -> float:
|
|
|
38
37
|
)
|
|
39
38
|
matching = mask.tobytes().count(b"\xff")
|
|
40
39
|
return matching / pixel_count
|
|
41
|
-
except
|
|
40
|
+
except (FileNotFoundError, OSError):
|
|
41
|
+
warnings.warn(
|
|
42
|
+
f"image_similarity: '{path1}' or '{path2}' not found in workspace, assigning reward 0",
|
|
43
|
+
stacklevel=2,
|
|
44
|
+
)
|
|
42
45
|
return 0.0
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_size_equals.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: check that an image has the expected dimensions."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -12,10 +13,14 @@ def image_size_equals(workspace: Path, path: str, width: int, height: int) -> bo
|
|
|
12
13
|
except ImportError:
|
|
13
14
|
raise ImportError(
|
|
14
15
|
"image_size_equals requires Pillow. "
|
|
15
|
-
"Install with:
|
|
16
|
+
"Install with: uv add harbor-rewardkit[image]"
|
|
16
17
|
)
|
|
17
18
|
try:
|
|
18
19
|
with Image.open(workspace / path) as img:
|
|
19
20
|
return img.size == (width, height)
|
|
20
|
-
except
|
|
21
|
+
except (FileNotFoundError, OSError):
|
|
22
|
+
warnings.warn(
|
|
23
|
+
f"image_size_equals: '{path}' not found in workspace, assigning reward 0",
|
|
24
|
+
stacklevel=2,
|
|
25
|
+
)
|
|
21
26
|
return False
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_key_equals.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Criterion: check that a JSON file has a specific key-value pair."""
|
|
2
2
|
|
|
3
3
|
import json as _json
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from rewardkit.session import criterion
|
|
@@ -13,5 +14,11 @@ def json_key_equals(workspace: Path, path: str, key: str, expected: object) -> b
|
|
|
13
14
|
if not isinstance(data, dict):
|
|
14
15
|
return False
|
|
15
16
|
return data.get(key) == expected
|
|
16
|
-
except (FileNotFoundError, OSError
|
|
17
|
+
except (FileNotFoundError, OSError):
|
|
18
|
+
warnings.warn(
|
|
19
|
+
f"json_key_equals: '{path}' not found in workspace, assigning reward 0",
|
|
20
|
+
stacklevel=2,
|
|
21
|
+
)
|
|
22
|
+
return False
|
|
23
|
+
except (ValueError, KeyError):
|
|
17
24
|
return False
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_path_equals.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Criterion: check a nested JSON path against an expected value."""
|
|
2
2
|
|
|
3
3
|
import json as _json
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from rewardkit.session import criterion
|
|
@@ -12,6 +13,15 @@ def json_path_equals(
|
|
|
12
13
|
) -> bool:
|
|
13
14
|
try:
|
|
14
15
|
data = _json.loads((workspace / path).read_text())
|
|
16
|
+
except (FileNotFoundError, OSError):
|
|
17
|
+
warnings.warn(
|
|
18
|
+
f"json_path_equals: '{path}' not found in workspace, assigning reward 0",
|
|
19
|
+
stacklevel=2,
|
|
20
|
+
)
|
|
21
|
+
return False
|
|
22
|
+
except ValueError:
|
|
23
|
+
return False
|
|
24
|
+
try:
|
|
15
25
|
for segment in json_path.split("."):
|
|
16
26
|
if isinstance(data, dict):
|
|
17
27
|
data = data[segment]
|
|
@@ -20,5 +30,5 @@ def json_path_equals(
|
|
|
20
30
|
else:
|
|
21
31
|
return False
|
|
22
32
|
return data == expected
|
|
23
|
-
except (
|
|
33
|
+
except (KeyError, IndexError, TypeError, ValueError):
|
|
24
34
|
return False
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Criterion: run a SQL query against a SQLite database and check the result."""
|
|
2
2
|
|
|
3
3
|
import sqlite3
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from rewardkit.session import criterion
|
|
@@ -12,6 +13,10 @@ def sqlite_query_equals(
|
|
|
12
13
|
) -> bool:
|
|
13
14
|
target = workspace / db_path
|
|
14
15
|
if not target.exists():
|
|
16
|
+
warnings.warn(
|
|
17
|
+
f"sqlite_query_equals: '{db_path}' not found in workspace, assigning reward 0",
|
|
18
|
+
stacklevel=2,
|
|
19
|
+
)
|
|
15
20
|
return False
|
|
16
21
|
try:
|
|
17
22
|
with sqlite3.connect(str(target)) as conn:
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/xlsx_cell_equals.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: check that a specific cell in an xlsx file has the expected value."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -14,15 +15,24 @@ def xlsx_cell_equals(
|
|
|
14
15
|
except ImportError:
|
|
15
16
|
raise ImportError(
|
|
16
17
|
"xlsx_cell_equals requires openpyxl. "
|
|
17
|
-
"Install with:
|
|
18
|
+
"Install with: uv add harbor-rewardkit[office]"
|
|
18
19
|
)
|
|
19
20
|
try:
|
|
20
21
|
wb = openpyxl.load_workbook(
|
|
21
22
|
str(workspace / path), read_only=True, data_only=True
|
|
22
23
|
)
|
|
24
|
+
except (FileNotFoundError, OSError) as e:
|
|
25
|
+
if isinstance(e, FileNotFoundError):
|
|
26
|
+
warnings.warn(
|
|
27
|
+
f"xlsx_cell_equals: '{path}' not found in workspace, assigning reward 0",
|
|
28
|
+
stacklevel=2,
|
|
29
|
+
)
|
|
30
|
+
return False
|
|
31
|
+
try:
|
|
23
32
|
ws = wb[sheet] if sheet else wb.active
|
|
24
33
|
value = ws[cell].value
|
|
25
|
-
wb.close()
|
|
26
34
|
return value == expected
|
|
27
|
-
except (
|
|
35
|
+
except (KeyError, ValueError):
|
|
28
36
|
return False
|
|
37
|
+
finally:
|
|
38
|
+
wb.close()
|
|
@@ -112,8 +112,8 @@ class _Overlay:
|
|
|
112
112
|
def isolate(path: Path) -> Generator[Path, None, None]:
|
|
113
113
|
"""Yield an overlayfs view of *path*. Writes go to a tmpdir; *path* is untouched."""
|
|
114
114
|
ov = _Overlay(path)
|
|
115
|
-
ov.mount()
|
|
116
115
|
try:
|
|
116
|
+
ov.mount()
|
|
117
117
|
yield ov._merged
|
|
118
118
|
finally:
|
|
119
119
|
ov.cleanup()
|
|
@@ -10,6 +10,7 @@ import os
|
|
|
10
10
|
import re
|
|
11
11
|
import shutil
|
|
12
12
|
import subprocess
|
|
13
|
+
import tempfile
|
|
13
14
|
from importlib import resources
|
|
14
15
|
from pathlib import Path
|
|
15
16
|
from typing import Any
|
|
@@ -43,6 +44,28 @@ def _build_criteria_block(criteria: list[Criterion]) -> str:
|
|
|
43
44
|
return "\n".join(lines)
|
|
44
45
|
|
|
45
46
|
|
|
47
|
+
def _build_response_schema(criteria: list[Criterion]) -> dict[str, Any]:
|
|
48
|
+
"""Build a JSON Schema that enforces the expected judge response structure."""
|
|
49
|
+
props: dict[str, Any] = {}
|
|
50
|
+
for c in criteria:
|
|
51
|
+
name = c.name or "criterion"
|
|
52
|
+
props[name] = {
|
|
53
|
+
"type": "object",
|
|
54
|
+
"properties": {
|
|
55
|
+
"score": c.output_format.json_schema(),
|
|
56
|
+
"reasoning": {"type": "string"},
|
|
57
|
+
},
|
|
58
|
+
"required": ["score", "reasoning"],
|
|
59
|
+
"additionalProperties": False,
|
|
60
|
+
}
|
|
61
|
+
return {
|
|
62
|
+
"type": "object",
|
|
63
|
+
"properties": props,
|
|
64
|
+
"required": list(props.keys()),
|
|
65
|
+
"additionalProperties": False,
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
46
69
|
def build_prompt(
|
|
47
70
|
criteria: list[Criterion],
|
|
48
71
|
template: str | None = None,
|
|
@@ -129,6 +152,9 @@ def _text_from_blocks(blocks: list[ContentBlock]) -> str:
|
|
|
129
152
|
return "\n\n".join(b["text"] for b in blocks if b.get("type") == "text")
|
|
130
153
|
|
|
131
154
|
|
|
155
|
+
_MAX_JUDGE_RETRIES = 3
|
|
156
|
+
|
|
157
|
+
|
|
132
158
|
def parse_judge_response(
|
|
133
159
|
text: str,
|
|
134
160
|
criteria: list[Criterion],
|
|
@@ -147,8 +173,13 @@ def parse_judge_response(
|
|
|
147
173
|
scores: list[Score] = []
|
|
148
174
|
for i, c in enumerate(criteria):
|
|
149
175
|
cname = c.name or f"criterion_{i}"
|
|
150
|
-
entry = data.get(cname
|
|
151
|
-
|
|
176
|
+
entry = data.get(cname)
|
|
177
|
+
if not isinstance(entry, dict) or "score" not in entry:
|
|
178
|
+
raise ValueError(
|
|
179
|
+
f"Criterion {cname!r}: expected dict with 'score' and 'reasoning', "
|
|
180
|
+
f"got {type(entry).__name__}: {str(entry)[:100]}"
|
|
181
|
+
)
|
|
182
|
+
raw_score = entry["score"]
|
|
152
183
|
reasoning = entry.get("reasoning", "")
|
|
153
184
|
value = c.output_format.normalize(raw_score)
|
|
154
185
|
weight = weights[i] if weights else 1.0
|
|
@@ -165,24 +196,6 @@ def parse_judge_response(
|
|
|
165
196
|
return scores
|
|
166
197
|
|
|
167
198
|
|
|
168
|
-
def _fallback_scores(
|
|
169
|
-
criteria: list[Criterion],
|
|
170
|
-
weights: list[float] | None,
|
|
171
|
-
error: str,
|
|
172
|
-
) -> list[Score]:
|
|
173
|
-
return [
|
|
174
|
-
Score(
|
|
175
|
-
name=c.name or f"criterion_{i}",
|
|
176
|
-
value=0.0,
|
|
177
|
-
raw=False,
|
|
178
|
-
weight=weights[i] if weights else 1.0,
|
|
179
|
-
error=error,
|
|
180
|
-
description=c.description,
|
|
181
|
-
)
|
|
182
|
-
for i, c in enumerate(criteria)
|
|
183
|
-
]
|
|
184
|
-
|
|
185
|
-
|
|
186
199
|
async def arun_llm(
|
|
187
200
|
judge: LLMJudge,
|
|
188
201
|
criteria: list[Criterion],
|
|
@@ -220,6 +233,14 @@ async def arun_llm(
|
|
|
220
233
|
)
|
|
221
234
|
available_tokens = max_input_tokens - prompt_tokens - user_tokens - 32_000
|
|
222
235
|
|
|
236
|
+
if available_tokens <= 0:
|
|
237
|
+
raise ValueError(
|
|
238
|
+
f"Trajectory too large to include in judge prompt: "
|
|
239
|
+
f"no token budget remaining "
|
|
240
|
+
f"(prompt={prompt_tokens}, user={user_tokens}, "
|
|
241
|
+
f"limit={max_input_tokens})."
|
|
242
|
+
)
|
|
243
|
+
|
|
223
244
|
traj_text = format_trajectory(
|
|
224
245
|
judge.atif_trajectory,
|
|
225
246
|
max_tokens=available_tokens,
|
|
@@ -231,23 +252,34 @@ async def arun_llm(
|
|
|
231
252
|
messages: list[dict[str, Any]] = [{"role": "system", "content": prompt}]
|
|
232
253
|
if user_blocks:
|
|
233
254
|
messages.append({"role": "user", "content": user_blocks})
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
criteria, weights, f"Failed to parse judge response: {e}"
|
|
255
|
+
for attempt in range(_MAX_JUDGE_RETRIES):
|
|
256
|
+
resp = await litellm.acompletion(
|
|
257
|
+
model=judge.model,
|
|
258
|
+
messages=messages,
|
|
259
|
+
response_format={
|
|
260
|
+
"type": "json_schema",
|
|
261
|
+
"json_schema": {
|
|
262
|
+
"name": "judge_response",
|
|
263
|
+
"schema": _build_response_schema(criteria),
|
|
264
|
+
"strict": True,
|
|
265
|
+
},
|
|
266
|
+
},
|
|
267
|
+
timeout=judge.timeout,
|
|
268
|
+
reasoning_effort=judge.reasoning_effort,
|
|
249
269
|
)
|
|
250
|
-
|
|
270
|
+
raw_output = resp.choices[0].message.content
|
|
271
|
+
try:
|
|
272
|
+
scores = parse_judge_response(raw_output, criteria, weights)
|
|
273
|
+
return scores, raw_output, warn_list
|
|
274
|
+
except ValueError:
|
|
275
|
+
if attempt == _MAX_JUDGE_RETRIES - 1:
|
|
276
|
+
raise
|
|
277
|
+
logger.debug(
|
|
278
|
+
"Judge response did not match schema, retrying (%d/%d)",
|
|
279
|
+
attempt + 1,
|
|
280
|
+
_MAX_JUDGE_RETRIES,
|
|
281
|
+
)
|
|
282
|
+
raise RuntimeError("Unreachable")
|
|
251
283
|
|
|
252
284
|
|
|
253
285
|
def _is_alpine() -> bool:
|
|
@@ -315,51 +347,89 @@ async def arun_agent(
|
|
|
315
347
|
prompt = build_prompt(criteria, kind="agent")
|
|
316
348
|
if judge.atif_trajectory:
|
|
317
349
|
prompt += f"\n\nThe agent's trajectory is stored at: {judge.atif_trajectory}"
|
|
350
|
+
schema = _build_response_schema(criteria)
|
|
351
|
+
schema_path: str | None = None
|
|
318
352
|
if judge.agent == "claude-code":
|
|
319
|
-
cmd = [
|
|
353
|
+
cmd = [
|
|
354
|
+
"claude",
|
|
355
|
+
"-p",
|
|
356
|
+
prompt,
|
|
357
|
+
"--output-format",
|
|
358
|
+
"json",
|
|
359
|
+
"--json-schema",
|
|
360
|
+
json.dumps(schema),
|
|
361
|
+
]
|
|
320
362
|
cmd_name = "claude"
|
|
321
363
|
else:
|
|
322
|
-
|
|
364
|
+
fd, schema_path = tempfile.mkstemp(suffix=".json")
|
|
365
|
+
with os.fdopen(fd, "w") as f:
|
|
366
|
+
json.dump(schema, f)
|
|
367
|
+
cmd = ["codex", "exec", prompt, "--output-schema", schema_path]
|
|
323
368
|
cmd_name = "codex"
|
|
324
369
|
|
|
325
370
|
if judge.model:
|
|
371
|
+
model_name = judge.model
|
|
372
|
+
# Claude CLI uses bare model names (e.g. "claude-haiku-4-5"),
|
|
373
|
+
# not provider-prefixed ones (e.g. "anthropic/claude-haiku-4-5").
|
|
374
|
+
if judge.agent == "claude-code" and model_name.startswith("anthropic/"):
|
|
375
|
+
model_name = model_name.removeprefix("anthropic/")
|
|
326
376
|
flag = "-m" if judge.agent == "codex" else "--model"
|
|
327
|
-
cmd.extend([flag,
|
|
377
|
+
cmd.extend([flag, model_name])
|
|
328
378
|
|
|
329
379
|
_ensure_cli(cmd_name)
|
|
330
380
|
cwd = judge.cwd or (
|
|
331
381
|
str(workspace) if workspace and Path(workspace).is_dir() else None
|
|
332
382
|
)
|
|
333
|
-
|
|
334
|
-
*cmd,
|
|
335
|
-
stdout=asyncio.subprocess.PIPE,
|
|
336
|
-
stderr=asyncio.subprocess.PIPE,
|
|
337
|
-
cwd=cwd,
|
|
338
|
-
)
|
|
339
|
-
try:
|
|
340
|
-
stdout, _stderr = await asyncio.wait_for(
|
|
341
|
-
proc.communicate(), timeout=judge.timeout
|
|
342
|
-
)
|
|
343
|
-
except asyncio.TimeoutError:
|
|
344
|
-
proc.kill()
|
|
345
|
-
await proc.communicate()
|
|
346
|
-
raise
|
|
347
|
-
raw_output = stdout.decode()
|
|
348
|
-
# Claude CLI with --output-format json wraps the actual response in a
|
|
349
|
-
# JSON envelope with a "result" field. Extract the inner text so
|
|
350
|
-
# parse_judge_response finds the scoring JSON, not the wrapper.
|
|
351
|
-
if judge.agent == "claude-code":
|
|
352
|
-
try:
|
|
353
|
-
envelope = json.loads(raw_output)
|
|
354
|
-
if isinstance(envelope, dict) and "result" in envelope:
|
|
355
|
-
raw_output = envelope["result"]
|
|
356
|
-
except (json.JSONDecodeError, TypeError):
|
|
357
|
-
pass
|
|
383
|
+
|
|
358
384
|
try:
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
385
|
+
for attempt in range(_MAX_JUDGE_RETRIES):
|
|
386
|
+
proc = await asyncio.create_subprocess_exec(
|
|
387
|
+
*cmd,
|
|
388
|
+
stdout=asyncio.subprocess.PIPE,
|
|
389
|
+
stderr=asyncio.subprocess.PIPE,
|
|
390
|
+
cwd=cwd,
|
|
391
|
+
)
|
|
392
|
+
try:
|
|
393
|
+
stdout, _stderr = await asyncio.wait_for(
|
|
394
|
+
proc.communicate(), timeout=judge.timeout
|
|
395
|
+
)
|
|
396
|
+
except asyncio.TimeoutError:
|
|
397
|
+
proc.kill()
|
|
398
|
+
await proc.communicate()
|
|
399
|
+
raise
|
|
400
|
+
raw_output = stdout.decode()
|
|
401
|
+
if proc.returncode != 0:
|
|
402
|
+
stderr_text = _stderr.decode().strip() if _stderr else ""
|
|
403
|
+
raise ValueError(
|
|
404
|
+
f"Agent CLI '{cmd_name}' exited with code {proc.returncode}: "
|
|
405
|
+
f"{stderr_text or raw_output[:200]}"
|
|
406
|
+
)
|
|
407
|
+
# Claude CLI with --output-format json and --json-schema wraps the
|
|
408
|
+
# structured response in an envelope with a "structured_output" field.
|
|
409
|
+
if judge.agent == "claude-code":
|
|
410
|
+
try:
|
|
411
|
+
envelope = json.loads(raw_output)
|
|
412
|
+
if isinstance(envelope, dict):
|
|
413
|
+
if envelope.get("is_error"):
|
|
414
|
+
raise ValueError(
|
|
415
|
+
f"Claude CLI returned an error: {envelope.get('result', raw_output[:200])}"
|
|
416
|
+
)
|
|
417
|
+
if "structured_output" in envelope:
|
|
418
|
+
raw_output = json.dumps(envelope["structured_output"])
|
|
419
|
+
except (json.JSONDecodeError, TypeError):
|
|
420
|
+
pass
|
|
421
|
+
try:
|
|
422
|
+
scores = parse_judge_response(raw_output, criteria, weights)
|
|
423
|
+
return scores, raw_output, warn_list
|
|
424
|
+
except ValueError:
|
|
425
|
+
if attempt == _MAX_JUDGE_RETRIES - 1:
|
|
426
|
+
raise
|
|
427
|
+
logger.debug(
|
|
428
|
+
"Agent judge response did not match schema, retrying (%d/%d)",
|
|
429
|
+
attempt + 1,
|
|
430
|
+
_MAX_JUDGE_RETRIES,
|
|
431
|
+
)
|
|
432
|
+
raise RuntimeError("Unreachable")
|
|
433
|
+
finally:
|
|
434
|
+
if schema_path:
|
|
435
|
+
Path(schema_path).unlink(missing_ok=True)
|
|
@@ -14,6 +14,7 @@ Aggregation = Literal["weighted_mean", "all_pass", "any_pass", "threshold"]
|
|
|
14
14
|
class OutputFormat(Protocol):
|
|
15
15
|
def normalize(self, raw: float | bool | str) -> float: ...
|
|
16
16
|
def prompt_fragment(self) -> str: ...
|
|
17
|
+
def json_schema(self) -> dict[str, Any]: ...
|
|
17
18
|
|
|
18
19
|
|
|
19
20
|
class Binary(BaseModel):
|
|
@@ -29,6 +30,9 @@ class Binary(BaseModel):
|
|
|
29
30
|
def prompt_fragment(self) -> str:
|
|
30
31
|
return '"yes" or "no"'
|
|
31
32
|
|
|
33
|
+
def json_schema(self) -> dict[str, Any]:
|
|
34
|
+
return {"type": "string", "enum": ["yes", "no"]}
|
|
35
|
+
|
|
32
36
|
|
|
33
37
|
class Likert(BaseModel):
|
|
34
38
|
model_config = ConfigDict(frozen=True)
|
|
@@ -43,6 +47,9 @@ class Likert(BaseModel):
|
|
|
43
47
|
def prompt_fragment(self) -> str:
|
|
44
48
|
return f"an integer from 1 to {self.points}"
|
|
45
49
|
|
|
50
|
+
def json_schema(self) -> dict[str, Any]:
|
|
51
|
+
return {"type": "integer"}
|
|
52
|
+
|
|
46
53
|
|
|
47
54
|
class Numeric(BaseModel):
|
|
48
55
|
model_config = ConfigDict(frozen=True)
|
|
@@ -59,6 +66,9 @@ class Numeric(BaseModel):
|
|
|
59
66
|
def prompt_fragment(self) -> str:
|
|
60
67
|
return f"a number from {self.min} to {self.max}"
|
|
61
68
|
|
|
69
|
+
def json_schema(self) -> dict[str, Any]:
|
|
70
|
+
return {"type": "number"}
|
|
71
|
+
|
|
62
72
|
|
|
63
73
|
def _slugify(text: str) -> str:
|
|
64
74
|
slug = re.sub(r"[^a-z0-9]+", "_", text[:40].lower())
|
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import inspect
|
|
5
|
+
import warnings
|
|
5
6
|
from contextlib import nullcontext
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import Any, Awaitable, TypeVar
|
|
@@ -102,9 +103,24 @@ class Reward:
|
|
|
102
103
|
if isinstance(raw, bool):
|
|
103
104
|
value = 1.0 if raw else 0.0
|
|
104
105
|
elif isinstance(raw, (int, float)):
|
|
105
|
-
value =
|
|
106
|
+
value = float(raw)
|
|
107
|
+
if value > 1.0:
|
|
108
|
+
warnings.warn(
|
|
109
|
+
f"Criterion {fn_name!r} returned {value:.4f} which exceeds 1.0; "
|
|
110
|
+
f"score will not be clamped — verify your criterion logic.",
|
|
111
|
+
stacklevel=2,
|
|
112
|
+
)
|
|
113
|
+
elif value < 0.0:
|
|
114
|
+
warnings.warn(
|
|
115
|
+
f"Criterion {fn_name!r} returned {value:.4f} which is below 0.0; "
|
|
116
|
+
f"score will not be clamped — verify your criterion logic.",
|
|
117
|
+
stacklevel=2,
|
|
118
|
+
)
|
|
106
119
|
else:
|
|
107
|
-
|
|
120
|
+
raise TypeError(
|
|
121
|
+
f"Criterion {fn_name!r} returned {type(raw).__name__}, "
|
|
122
|
+
f"expected bool, int, or float."
|
|
123
|
+
)
|
|
108
124
|
|
|
109
125
|
return Score(
|
|
110
126
|
name=fn_name,
|
|
@@ -113,15 +129,13 @@ class Reward:
|
|
|
113
129
|
weight=weight,
|
|
114
130
|
description=description,
|
|
115
131
|
)
|
|
132
|
+
except ImportError as e:
|
|
133
|
+
raise ImportError(
|
|
134
|
+
f"Criterion {fn_name!r} failed due to missing dependency: {e}. "
|
|
135
|
+
f"Install the required extra (e.g. uv add harbor-rewardkit[all])."
|
|
136
|
+
) from e
|
|
116
137
|
except Exception as e:
|
|
117
|
-
|
|
118
|
-
name=fn_name,
|
|
119
|
-
value=0.0,
|
|
120
|
-
raw=False,
|
|
121
|
-
weight=weight,
|
|
122
|
-
error=str(e),
|
|
123
|
-
description=description,
|
|
124
|
-
)
|
|
138
|
+
raise RuntimeError(f"Criterion {fn_name!r} failed: {e}") from e
|
|
125
139
|
|
|
126
140
|
def _run_one(self, i: int, fn: Any) -> Score:
|
|
127
141
|
"""Run a single criterion, with isolation if configured."""
|
|
@@ -132,6 +146,16 @@ class Reward:
|
|
|
132
146
|
return self._eval_criterion(i, fn, self.workspace)
|
|
133
147
|
|
|
134
148
|
async def arun(self, sem: asyncio.Semaphore | None = None) -> list[Score]:
|
|
149
|
+
try:
|
|
150
|
+
return await self._arun_inner(sem)
|
|
151
|
+
except ExceptionGroup as eg:
|
|
152
|
+
# Unwrap TaskGroup ExceptionGroup to surface the first real error.
|
|
153
|
+
first = eg.exceptions[0]
|
|
154
|
+
raise RuntimeError(f"Reward {self.name!r} failed: {first}") from first
|
|
155
|
+
except Exception as e:
|
|
156
|
+
raise RuntimeError(f"Reward {self.name!r} failed: {e}") from e
|
|
157
|
+
|
|
158
|
+
async def _arun_inner(self, sem: asyncio.Semaphore | None) -> list[Score]:
|
|
135
159
|
if self.judge is None:
|
|
136
160
|
tasks: list[asyncio.Task[Score]] = []
|
|
137
161
|
async with asyncio.TaskGroup() as tg:
|
|
@@ -18,6 +18,7 @@ from rewardkit.models import (
|
|
|
18
18
|
Criterion,
|
|
19
19
|
LLMJudge,
|
|
20
20
|
Likert,
|
|
21
|
+
Numeric,
|
|
21
22
|
)
|
|
22
23
|
from rewardkit.reward import Reward
|
|
23
24
|
from rewardkit.session import Session, _builtin_names, _factory_registry, set_current
|
|
@@ -28,6 +29,13 @@ def _load_toml(path: Path) -> dict[str, Any]:
|
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
def _import_py_file(path: Path) -> None:
|
|
32
|
+
"""Import a Python file as a module, caching by file-path hash.
|
|
33
|
+
|
|
34
|
+
Once imported, subsequent calls with the same resolved path are
|
|
35
|
+
no-ops. This is intentional for the primary single-run container
|
|
36
|
+
use case but means repeated ``discover()`` or ``run()`` calls in a
|
|
37
|
+
REPL or notebook will not re-execute already-loaded criterion files.
|
|
38
|
+
"""
|
|
31
39
|
import hashlib
|
|
32
40
|
|
|
33
41
|
digest = hashlib.sha1(str(path.resolve()).encode()).hexdigest()[:12]
|
|
@@ -48,6 +56,11 @@ def _build_criteria_from_toml(toml_criteria: list[dict[str, Any]]) -> list[Crite
|
|
|
48
56
|
fmt_name = c.get("type", "binary")
|
|
49
57
|
if fmt_name == "likert":
|
|
50
58
|
output_format = Likert(points=c.get("points", 5))
|
|
59
|
+
elif fmt_name == "numeric":
|
|
60
|
+
output_format = Numeric(
|
|
61
|
+
min=c.get("min", 0.0),
|
|
62
|
+
max=c.get("max", 1.0),
|
|
63
|
+
)
|
|
51
64
|
else:
|
|
52
65
|
output_format = Binary()
|
|
53
66
|
criteria.append(
|
|
@@ -372,12 +385,23 @@ def run_multi(
|
|
|
372
385
|
to stdout for overlapping reward names.
|
|
373
386
|
"""
|
|
374
387
|
all_rewards: list[Reward] = []
|
|
375
|
-
dir_labels
|
|
388
|
+
dir_labels = [Path(d).name for d in tests_dirs]
|
|
389
|
+
if len(dir_labels) != len(set(dir_labels)):
|
|
390
|
+
dupes = {name for name in dir_labels if dir_labels.count(name) > 1}
|
|
391
|
+
paths_by_label = {
|
|
392
|
+
name: [str(d) for d, n in zip(tests_dirs, dir_labels) if n == name]
|
|
393
|
+
for name in dupes
|
|
394
|
+
}
|
|
395
|
+
raise ValueError(
|
|
396
|
+
"Duplicate test directory basenames: "
|
|
397
|
+
+ ", ".join(
|
|
398
|
+
f"{name!r} ({', '.join(ps)})" for name, ps in paths_by_label.items()
|
|
399
|
+
)
|
|
400
|
+
+ ". Use directories with distinct basenames."
|
|
401
|
+
)
|
|
376
402
|
dir_reward_ranges: list[tuple[int, int]] = [] # (start, end) indices
|
|
377
403
|
|
|
378
404
|
for tests_dir in tests_dirs:
|
|
379
|
-
label = Path(tests_dir).name
|
|
380
|
-
dir_labels.append(label)
|
|
381
405
|
rewards = discover(tests_dir, workspace=workspace)
|
|
382
406
|
start = len(all_rewards)
|
|
383
407
|
all_rewards.extend(rewards)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/_trajectory.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_succeeds.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_exists.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_not_exists.py
RENAMED
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/http_status_equals.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev1 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/llm_trajectory.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|