harbor-rewardkit 0.1.dev0__tar.gz → 0.1.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/PKG-INFO +3 -3
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/README.md +2 -2
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/pyproject.toml +1 -1
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/csv_cell_equals.py +9 -1
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/diff_ratio.py +5 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/file_contains.py +5 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/file_contains_regex.py +5 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/file_matches.py +5 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/files_equal.py +5 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/image_similarity.py +7 -4
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/image_size_equals.py +7 -2
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/json_key_equals.py +8 -1
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/json_path_equals.py +11 -1
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/sqlite_query_equals.py +5 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/xlsx_cell_equals.py +10 -2
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/judges.py +15 -34
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/reward.py +16 -8
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/__init__.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/__main__.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/compare.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/__init__.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/_command.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/_trajectory.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/command_output_contains.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/command_output_matches.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/command_output_matches_regex.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/command_succeeds.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/file_exists.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/file_not_exists.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/http_response_contains.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/http_status_equals.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/trajectory_tool_not_used.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/trajectory_tool_used.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/trajectory_turn_count.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/isolation.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/models.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/prompts/agent.md +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/prompts/llm.md +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/prompts/llm_trajectory.md +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/runner.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/session.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/trajectory.py +0 -0
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: harbor-rewardkit
|
|
3
|
-
Version: 0.1.dev0
|
|
3
|
+
Version: 0.1.dev3
|
|
4
4
|
Summary: Lightweight grading toolkit for environment-based tasks.
|
|
5
5
|
Keywords: grading,evaluation,rewards,llm,agents,benchmarks
|
|
6
6
|
Author: benediktstroebl
|
|
@@ -35,7 +35,7 @@ The Harbor Rewardkit is a lightweight package to define and run verifiers. Rewar
|
|
|
35
35
|
## Installation
|
|
36
36
|
|
|
37
37
|
```bash
|
|
38
|
-
uv tool install harbor-
|
|
38
|
+
uv tool install harbor-rewardkit
|
|
39
39
|
```
|
|
40
40
|
|
|
41
41
|
## Example: Programmatic criteria
|
|
@@ -67,7 +67,7 @@ Add rewardkit to your `test.sh` file:
|
|
|
67
67
|
|
|
68
68
|
```bash
|
|
69
69
|
# tests/test.sh
|
|
70
|
-
uvx harbor-
|
|
70
|
+
uvx harbor-rewardkit@0.1 /tests
|
|
71
71
|
```
|
|
72
72
|
|
|
73
73
|
See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/README.md
RENAMED
|
@@ -8,7 +8,7 @@ The Harbor Rewardkit is a lightweight package to define and run verifiers. Rewar
|
|
|
8
8
|
## Installation
|
|
9
9
|
|
|
10
10
|
```bash
|
|
11
|
-
uv tool install harbor-
|
|
11
|
+
uv tool install harbor-rewardkit
|
|
12
12
|
```
|
|
13
13
|
|
|
14
14
|
## Example: Programmatic criteria
|
|
@@ -40,7 +40,7 @@ Add rewardkit to your `test.sh` file:
|
|
|
40
40
|
|
|
41
41
|
```bash
|
|
42
42
|
# tests/test.sh
|
|
43
|
-
uvx harbor-
|
|
43
|
+
uvx harbor-rewardkit@0.1 /tests
|
|
44
44
|
```
|
|
45
45
|
|
|
46
46
|
See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/csv_cell_equals.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Criterion: check that a specific CSV cell has the expected value."""
|
|
2
2
|
|
|
3
3
|
import csv
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from rewardkit.session import criterion
|
|
@@ -29,5 +30,12 @@ def csv_cell_equals(
|
|
|
29
30
|
return r[col] == expected
|
|
30
31
|
return False
|
|
31
32
|
return False
|
|
32
|
-
except (FileNotFoundError, OSError
|
|
33
|
+
except (FileNotFoundError, OSError) as e:
|
|
34
|
+
if isinstance(e, FileNotFoundError):
|
|
35
|
+
warnings.warn(
|
|
36
|
+
f"csv_cell_equals: '{path}' not found in workspace, assigning reward 0",
|
|
37
|
+
stacklevel=2,
|
|
38
|
+
)
|
|
39
|
+
return False
|
|
40
|
+
except (csv.Error, KeyError, IndexError):
|
|
33
41
|
return False
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/diff_ratio.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: fuzzy text comparison returning a similarity ratio [0, 1]."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from difflib import SequenceMatcher
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
@@ -11,5 +12,9 @@ def diff_ratio(workspace: Path, path: str, expected: str) -> float:
|
|
|
11
12
|
try:
|
|
12
13
|
content = (workspace / path).read_text()
|
|
13
14
|
except (FileNotFoundError, OSError):
|
|
15
|
+
warnings.warn(
|
|
16
|
+
f"diff_ratio: '{path}' not found in workspace, assigning reward 0",
|
|
17
|
+
stacklevel=2,
|
|
18
|
+
)
|
|
14
19
|
return 0.0
|
|
15
20
|
return SequenceMatcher(None, content.strip(), expected.strip()).ratio()
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/file_contains.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: check that a file contains a given string."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -10,4 +11,8 @@ def file_contains(workspace: Path, path: str, text: str) -> bool:
|
|
|
10
11
|
try:
|
|
11
12
|
return text in (workspace / path).read_text()
|
|
12
13
|
except (FileNotFoundError, OSError):
|
|
14
|
+
warnings.warn(
|
|
15
|
+
f"file_contains: '{path}' not found in workspace, assigning reward 0",
|
|
16
|
+
stacklevel=2,
|
|
17
|
+
)
|
|
13
18
|
return False
|
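{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/file_contains_regex.py
RENAMED
|
@@ -1,6 +1,7 @@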
|
|
|
1
1
|
"""Criterion: check that a file contains text matching a regex pattern."""
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from rewardkit.session import criterion
|
|
@@ -11,4 +12,8 @@ def file_contains_regex(workspace: Path, path: str, pattern: str) -> bool:
|
|
|
11
12
|
try:
|
|
12
13
|
return re.search(pattern, (workspace / path).read_text()) is not None
|
|
13
14
|
except (FileNotFoundError, OSError):
|
|
15
|
+
warnings.warn(
|
|
16
|
+
f"file_contains_regex: '{path}' not found in workspace, assigning reward 0",
|
|
17
|
+
stacklevel=2,
|
|
18
|
+
)
|
|
14
19
|
return False
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/file_matches.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: check that a file's content matches an expected string."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -12,4 +13,8 @@ def file_matches(workspace: Path, path: str, expected: str) -> bool:
|
|
|
12
13
|
try:
|
|
13
14
|
return (workspace / path).read_text().strip() == expected.strip()
|
|
14
15
|
except (FileNotFoundError, OSError):
|
|
16
|
+
warnings.warn(
|
|
17
|
+
f"file_matches: '{path}' not found in workspace, assigning reward 0",
|
|
18
|
+
stacklevel=2,
|
|
19
|
+
)
|
|
15
20
|
return False
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/files_equal.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: check that two workspace files have equal content."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -12,4 +13,8 @@ def files_equal(workspace: Path, path1: str, path2: str) -> bool:
|
|
|
12
13
|
workspace / path2
|
|
13
14
|
).read_text().strip()
|
|
14
15
|
except (FileNotFoundError, OSError):
|
|
16
|
+
warnings.warn(
|
|
17
|
+
f"files_equal: '{path1}' or '{path2}' not found in workspace, assigning reward 0",
|
|
18
|
+
stacklevel=2,
|
|
19
|
+
)
|
|
15
20
|
return False
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/image_similarity.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: compare two images pixel-by-pixel and return similarity ratio."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -12,7 +13,7 @@ def image_similarity(workspace: Path, path1: str, path2: str) -> float:
|
|
|
12
13
|
except ImportError:
|
|
13
14
|
raise ImportError(
|
|
14
15
|
"image_similarity requires Pillow. "
|
|
15
|
-
"Install with:
|
|
16
|
+
"Install with: uv add harbor-rewardkit[image]"
|
|
16
17
|
)
|
|
17
18
|
try:
|
|
18
19
|
with (
|
|
@@ -25,8 +26,6 @@ def image_similarity(workspace: Path, path1: str, path2: str) -> float:
|
|
|
25
26
|
rgba2 = img2.convert("RGBA")
|
|
26
27
|
diff = ImageChops.difference(rgba1, rgba2)
|
|
27
28
|
pixel_count = rgba1.size[0] * rgba1.size[1]
|
|
28
|
-
# A pixel matches iff all 4 RGBA channels have zero difference.
|
|
29
|
-
# Use per-channel point() to build a mask in C rather than Python loops.
|
|
30
29
|
r, g, b, a = diff.split()
|
|
31
30
|
|
|
32
31
|
def zero_fn(x: int) -> int:
|
|
@@ -38,5 +37,9 @@ def image_similarity(workspace: Path, path1: str, path2: str) -> float:
|
|
|
38
37
|
)
|
|
39
38
|
matching = mask.tobytes().count(b"\xff")
|
|
40
39
|
return matching / pixel_count
|
|
41
|
-
except
|
|
40
|
+
except (FileNotFoundError, OSError):
|
|
41
|
+
warnings.warn(
|
|
42
|
+
f"image_similarity: '{path1}' or '{path2}' not found in workspace, assigning reward 0",
|
|
43
|
+
stacklevel=2,
|
|
44
|
+
)
|
|
42
45
|
return 0.0
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/image_size_equals.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: check that an image has the expected dimensions."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -12,10 +13,14 @@ def image_size_equals(workspace: Path, path: str, width: int, height: int) -> bo
|
|
|
12
13
|
except ImportError:
|
|
13
14
|
raise ImportError(
|
|
14
15
|
"image_size_equals requires Pillow. "
|
|
15
|
-
"Install with:
|
|
16
|
+
"Install with: uv add harbor-rewardkit[image]"
|
|
16
17
|
)
|
|
17
18
|
try:
|
|
18
19
|
with Image.open(workspace / path) as img:
|
|
19
20
|
return img.size == (width, height)
|
|
20
|
-
except
|
|
21
|
+
except (FileNotFoundError, OSError):
|
|
22
|
+
warnings.warn(
|
|
23
|
+
f"image_size_equals: '{path}' not found in workspace, assigning reward 0",
|
|
24
|
+
stacklevel=2,
|
|
25
|
+
)
|
|
21
26
|
return False
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/json_key_equals.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Criterion: check that a JSON file has a specific key-value pair."""
|
|
2
2
|
|
|
3
3
|
import json as _json
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from rewardkit.session import criterion
|
|
@@ -13,5 +14,11 @@ def json_key_equals(workspace: Path, path: str, key: str, expected: object) -> b
|
|
|
13
14
|
if not isinstance(data, dict):
|
|
14
15
|
return False
|
|
15
16
|
return data.get(key) == expected
|
|
16
|
-
except (FileNotFoundError, OSError
|
|
17
|
+
except (FileNotFoundError, OSError):
|
|
18
|
+
warnings.warn(
|
|
19
|
+
f"json_key_equals: '{path}' not found in workspace, assigning reward 0",
|
|
20
|
+
stacklevel=2,
|
|
21
|
+
)
|
|
22
|
+
return False
|
|
23
|
+
except (ValueError, KeyError):
|
|
17
24
|
return False
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/json_path_equals.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Criterion: check a nested JSON path against an expected value."""
|
|
2
2
|
|
|
3
3
|
import json as _json
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from rewardkit.session import criterion
|
|
@@ -12,6 +13,15 @@ def json_path_equals(
|
|
|
12
13
|
) -> bool:
|
|
13
14
|
try:
|
|
14
15
|
data = _json.loads((workspace / path).read_text())
|
|
16
|
+
except (FileNotFoundError, OSError):
|
|
17
|
+
warnings.warn(
|
|
18
|
+
f"json_path_equals: '{path}' not found in workspace, assigning reward 0",
|
|
19
|
+
stacklevel=2,
|
|
20
|
+
)
|
|
21
|
+
return False
|
|
22
|
+
except ValueError:
|
|
23
|
+
return False
|
|
24
|
+
try:
|
|
15
25
|
for segment in json_path.split("."):
|
|
16
26
|
if isinstance(data, dict):
|
|
17
27
|
data = data[segment]
|
|
@@ -20,5 +30,5 @@ def json_path_equals(
|
|
|
20
30
|
else:
|
|
21
31
|
return False
|
|
22
32
|
return data == expected
|
|
23
|
-
except (
|
|
33
|
+
except (KeyError, IndexError, TypeError, ValueError):
|
|
24
34
|
return False
|
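{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/sqlite_query_equals.py
RENAMED
|
@@ -1,6 +1,7 @@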
|
|
|
1
1
|
"""Criterion: run a SQL query against a SQLite database and check the result."""
|
|
2
2
|
|
|
3
3
|
import sqlite3
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from rewardkit.session import criterion
|
|
@@ -12,6 +13,10 @@ def sqlite_query_equals(
|
|
|
12
13
|
) -> bool:
|
|
13
14
|
target = workspace / db_path
|
|
14
15
|
if not target.exists():
|
|
16
|
+
warnings.warn(
|
|
17
|
+
f"sqlite_query_equals: '{db_path}' not found in workspace, assigning reward 0",
|
|
18
|
+
stacklevel=2,
|
|
19
|
+
)
|
|
15
20
|
return False
|
|
16
21
|
try:
|
|
17
22
|
with sqlite3.connect(str(target)) as conn:
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/xlsx_cell_equals.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Criterion: check that a specific cell in an xlsx file has the expected value."""
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
from rewardkit.session import criterion
|
|
@@ -14,7 +15,7 @@ def xlsx_cell_equals(
|
|
|
14
15
|
except ImportError:
|
|
15
16
|
raise ImportError(
|
|
16
17
|
"xlsx_cell_equals requires openpyxl. "
|
|
17
|
-
"Install with:
|
|
18
|
+
"Install with: uv add harbor-rewardkit[office]"
|
|
18
19
|
)
|
|
19
20
|
try:
|
|
20
21
|
wb = openpyxl.load_workbook(
|
|
@@ -24,5 +25,12 @@ def xlsx_cell_equals(
|
|
|
24
25
|
value = ws[cell].value
|
|
25
26
|
wb.close()
|
|
26
27
|
return value == expected
|
|
27
|
-
except (FileNotFoundError, OSError
|
|
28
|
+
except (FileNotFoundError, OSError) as e:
|
|
29
|
+
if isinstance(e, FileNotFoundError):
|
|
30
|
+
warnings.warn(
|
|
31
|
+
f"xlsx_cell_equals: '{path}' not found in workspace, assigning reward 0",
|
|
32
|
+
stacklevel=2,
|
|
33
|
+
)
|
|
34
|
+
return False
|
|
35
|
+
except (KeyError, ValueError):
|
|
28
36
|
return False
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/judges.py
RENAMED
|
@@ -165,24 +165,6 @@ def parse_judge_response(
|
|
|
165
165
|
return scores
|
|
166
166
|
|
|
167
167
|
|
|
168
|
-
def _fallback_scores(
|
|
169
|
-
criteria: list[Criterion],
|
|
170
|
-
weights: list[float] | None,
|
|
171
|
-
error: str,
|
|
172
|
-
) -> list[Score]:
|
|
173
|
-
return [
|
|
174
|
-
Score(
|
|
175
|
-
name=c.name or f"criterion_{i}",
|
|
176
|
-
value=0.0,
|
|
177
|
-
raw=False,
|
|
178
|
-
weight=weights[i] if weights else 1.0,
|
|
179
|
-
error=error,
|
|
180
|
-
description=c.description,
|
|
181
|
-
)
|
|
182
|
-
for i, c in enumerate(criteria)
|
|
183
|
-
]
|
|
184
|
-
|
|
185
|
-
|
|
186
168
|
async def arun_llm(
|
|
187
169
|
judge: LLMJudge,
|
|
188
170
|
criteria: list[Criterion],
|
|
@@ -240,13 +222,7 @@ async def arun_llm(
|
|
|
240
222
|
reasoning_effort=judge.reasoning_effort,
|
|
241
223
|
)
|
|
242
224
|
raw_output = resp.choices[0].message.content
|
|
243
|
-
|
|
244
|
-
scores = parse_judge_response(raw_output, criteria, weights)
|
|
245
|
-
except (ValueError, json.JSONDecodeError) as e:
|
|
246
|
-
warn_list.append(f"Judge response parse error: {e}")
|
|
247
|
-
scores = _fallback_scores(
|
|
248
|
-
criteria, weights, f"Failed to parse judge response: {e}"
|
|
249
|
-
)
|
|
225
|
+
scores = parse_judge_response(raw_output, criteria, weights)
|
|
250
226
|
return scores, raw_output, warn_list
|
|
251
227
|
|
|
252
228
|
|
|
@@ -319,11 +295,12 @@ async def arun_agent(
|
|
|
319
295
|
cmd = ["claude", "-p", prompt, "--output-format", "json"]
|
|
320
296
|
cmd_name = "claude"
|
|
321
297
|
else:
|
|
322
|
-
cmd = ["codex", "
|
|
298
|
+
cmd = ["codex", "exec", prompt]
|
|
323
299
|
cmd_name = "codex"
|
|
324
300
|
|
|
325
301
|
if judge.model:
|
|
326
|
-
|
|
302
|
+
flag = "-m" if judge.agent == "codex" else "--model"
|
|
303
|
+
cmd.extend([flag, judge.model])
|
|
327
304
|
|
|
328
305
|
_ensure_cli(cmd_name)
|
|
329
306
|
cwd = judge.cwd or (
|
|
@@ -344,11 +321,15 @@ async def arun_agent(
|
|
|
344
321
|
await proc.communicate()
|
|
345
322
|
raise
|
|
346
323
|
raw_output = stdout.decode()
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
324
|
+
# Claude CLI with --output-format json wraps the actual response in a
|
|
325
|
+
# JSON envelope with a "result" field. Extract the inner text so
|
|
326
|
+
# parse_judge_response finds the scoring JSON, not the wrapper.
|
|
327
|
+
if judge.agent == "claude-code":
|
|
328
|
+
try:
|
|
329
|
+
envelope = json.loads(raw_output)
|
|
330
|
+
if isinstance(envelope, dict) and "result" in envelope:
|
|
331
|
+
raw_output = envelope["result"]
|
|
332
|
+
except (json.JSONDecodeError, TypeError):
|
|
333
|
+
pass
|
|
334
|
+
scores = parse_judge_response(raw_output, criteria, weights)
|
|
354
335
|
return scores, raw_output, warn_list
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/reward.py
RENAMED
|
@@ -113,15 +113,13 @@ class Reward:
|
|
|
113
113
|
weight=weight,
|
|
114
114
|
description=description,
|
|
115
115
|
)
|
|
116
|
+
except ImportError as e:
|
|
117
|
+
raise ImportError(
|
|
118
|
+
f"Criterion {fn_name!r} failed due to missing dependency: {e}. "
|
|
119
|
+
f"Install the required extra (e.g. uv add harbor-rewardkit[all])."
|
|
120
|
+
) from e
|
|
116
121
|
except Exception as e:
|
|
117
|
-
|
|
118
|
-
name=fn_name,
|
|
119
|
-
value=0.0,
|
|
120
|
-
raw=False,
|
|
121
|
-
weight=weight,
|
|
122
|
-
error=str(e),
|
|
123
|
-
description=description,
|
|
124
|
-
)
|
|
122
|
+
raise RuntimeError(f"Criterion {fn_name!r} failed: {e}") from e
|
|
125
123
|
|
|
126
124
|
def _run_one(self, i: int, fn: Any) -> Score:
|
|
127
125
|
"""Run a single criterion, with isolation if configured."""
|
|
@@ -132,6 +130,16 @@ class Reward:
|
|
|
132
130
|
return self._eval_criterion(i, fn, self.workspace)
|
|
133
131
|
|
|
134
132
|
async def arun(self, sem: asyncio.Semaphore | None = None) -> list[Score]:
|
|
133
|
+
try:
|
|
134
|
+
return await self._arun_inner(sem)
|
|
135
|
+
except ExceptionGroup as eg:
|
|
136
|
+
# Unwrap TaskGroup ExceptionGroup to surface the first real error.
|
|
137
|
+
first = eg.exceptions[0]
|
|
138
|
+
raise RuntimeError(f"Reward {self.name!r} failed: {first}") from first
|
|
139
|
+
except Exception as e:
|
|
140
|
+
raise RuntimeError(f"Reward {self.name!r} failed: {e}") from e
|
|
141
|
+
|
|
142
|
+
async def _arun_inner(self, sem: asyncio.Semaphore | None) -> list[Score]:
|
|
135
143
|
if self.judge is None:
|
|
136
144
|
tasks: list[asyncio.Task[Score]] = []
|
|
137
145
|
async with asyncio.TaskGroup() as tg:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/_trajectory.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/command_succeeds.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/file_exists.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/file_not_exists.py
RENAMED
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/criteria/http_status_equals.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev3}/src/rewardkit/prompts/llm_trajectory.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|