harbor-rewardkit 0.1.dev0__tar.gz → 0.1.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/PKG-INFO +3 -3
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/README.md +2 -2
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/pyproject.toml +1 -1
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/judges.py +13 -2
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/__init__.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/__main__.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/compare.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/__init__.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/_command.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/_trajectory.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/command_output_contains.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/command_output_matches.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/command_output_matches_regex.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/command_succeeds.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/csv_cell_equals.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/diff_ratio.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_contains.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_contains_regex.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_exists.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_matches.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_not_exists.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/files_equal.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/http_response_contains.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/http_status_equals.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/image_similarity.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/image_size_equals.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/json_key_equals.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/json_path_equals.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/sqlite_query_equals.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/trajectory_tool_not_used.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/trajectory_tool_used.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/trajectory_turn_count.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/xlsx_cell_equals.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/isolation.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/models.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/prompts/agent.md +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/prompts/llm.md +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/prompts/llm_trajectory.md +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/reward.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/runner.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/session.py +0 -0
- {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/trajectory.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: harbor-rewardkit
|
|
3
|
-
Version: 0.1.dev0
|
|
3
|
+
Version: 0.1.dev1
|
|
4
4
|
Summary: Lightweight grading toolkit for environment-based tasks.
|
|
5
5
|
Keywords: grading,evaluation,rewards,llm,agents,benchmarks
|
|
6
6
|
Author: benediktstroebl
|
|
@@ -35,7 +35,7 @@ The Harbor Rewardkit is a lightweight package to define and run verifiers. Rewar
|
|
|
35
35
|
## Installation
|
|
36
36
|
|
|
37
37
|
```bash
|
|
38
|
-
uv tool install harbor-
|
|
38
|
+
uv tool install harbor-rewardkit
|
|
39
39
|
```
|
|
40
40
|
|
|
41
41
|
## Example: Programmatic criteria
|
|
@@ -67,7 +67,7 @@ Add rewardkit to your `test.sh` file:
|
|
|
67
67
|
|
|
68
68
|
```bash
|
|
69
69
|
# tests/test.sh
|
|
70
|
-
uvx harbor-
|
|
70
|
+
uvx harbor-rewardkit@0.1 /tests
|
|
71
71
|
```
|
|
72
72
|
|
|
73
73
|
See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).
|
|
@@ -8,7 +8,7 @@ The Harbor Rewardkit is a lightweight package to define and run verifiers. Rewar
|
|
|
8
8
|
## Installation
|
|
9
9
|
|
|
10
10
|
```bash
|
|
11
|
-
uv tool install harbor-
|
|
11
|
+
uv tool install harbor-rewardkit
|
|
12
12
|
```
|
|
13
13
|
|
|
14
14
|
## Example: Programmatic criteria
|
|
@@ -40,7 +40,7 @@ Add rewardkit to your `test.sh` file:
|
|
|
40
40
|
|
|
41
41
|
```bash
|
|
42
42
|
# tests/test.sh
|
|
43
|
-
uvx harbor-
|
|
43
|
+
uvx harbor-rewardkit@0.1 /tests
|
|
44
44
|
```
|
|
45
45
|
|
|
46
46
|
See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).
|
|
@@ -319,11 +319,12 @@ async def arun_agent(
|
|
|
319
319
|
cmd = ["claude", "-p", prompt, "--output-format", "json"]
|
|
320
320
|
cmd_name = "claude"
|
|
321
321
|
else:
|
|
322
|
-
cmd = ["codex", "
|
|
322
|
+
cmd = ["codex", "exec", prompt]
|
|
323
323
|
cmd_name = "codex"
|
|
324
324
|
|
|
325
325
|
if judge.model:
|
|
326
|
-
|
|
326
|
+
flag = "-m" if judge.agent == "codex" else "--model"
|
|
327
|
+
cmd.extend([flag, judge.model])
|
|
327
328
|
|
|
328
329
|
_ensure_cli(cmd_name)
|
|
329
330
|
cwd = judge.cwd or (
|
|
@@ -344,6 +345,16 @@ async def arun_agent(
|
|
|
344
345
|
await proc.communicate()
|
|
345
346
|
raise
|
|
346
347
|
raw_output = stdout.decode()
|
|
348
|
+
# Claude CLI with --output-format json wraps the actual response in a
|
|
349
|
+
# JSON envelope with a "result" field. Extract the inner text so
|
|
350
|
+
# parse_judge_response finds the scoring JSON, not the wrapper.
|
|
351
|
+
if judge.agent == "claude-code":
|
|
352
|
+
try:
|
|
353
|
+
envelope = json.loads(raw_output)
|
|
354
|
+
if isinstance(envelope, dict) and "result" in envelope:
|
|
355
|
+
raw_output = envelope["result"]
|
|
356
|
+
except (json.JSONDecodeError, TypeError):
|
|
357
|
+
pass
|
|
347
358
|
try:
|
|
348
359
|
scores = parse_judge_response(raw_output, criteria, weights)
|
|
349
360
|
except (ValueError, json.JSONDecodeError) as e:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/_trajectory.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/command_succeeds.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/csv_cell_equals.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/diff_ratio.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_contains.py
RENAMED
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_exists.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_matches.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_not_exists.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/files_equal.py
RENAMED
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/http_status_equals.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/image_similarity.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/image_size_equals.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/json_key_equals.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/json_path_equals.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/xlsx_cell_equals.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/prompts/llm_trajectory.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|