PyPI - harbor-rewardkit - Versions diffs - 0.1.dev0__tar.gz → 0.1.dev1__tar.gz - Mend

@@ -319,11 +319,12 @@ async def arun_agent(
         cmd = ["claude", "-p", prompt, "--output-format", "json"]
         cmd_name = "claude"
     else:
-        cmd = ["codex", "-q", prompt]
+        cmd = ["codex", "exec", prompt]
         cmd_name = "codex"
     if judge.model:
-        cmd.extend(["--model", judge.model])
+        flag = "-m" if judge.agent == "codex" else "--model"
+        cmd.extend([flag, judge.model])
     _ensure_cli(cmd_name)
     cwd = judge.cwd or (
@@ -344,6 +345,16 @@ async def arun_agent(
         await proc.communicate()
         raise
     raw_output = stdout.decode()
+    # Claude CLI with --output-format json wraps the actual response in a
+    # JSON envelope with a "result" field. Extract the inner text so
+    # parse_judge_response finds the scoring JSON, not the wrapper.
+    if judge.agent == "claude-code":
+        try:
+            envelope = json.loads(raw_output)
+            if isinstance(envelope, dict) and "result" in envelope:
+                raw_output = envelope["result"]
+        except (json.JSONDecodeError, TypeError):
+            pass
     try:
         scores = parse_judge_response(raw_output, criteria, weights)
     except (ValueError, json.JSONDecodeError) as e:

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: harbor-rewardkit
-Version: 0.1.dev0
+Version: 0.1.dev1
 Summary: Lightweight grading toolkit for environment-based tasks.
 Keywords: grading,evaluation,rewards,llm,agents,benchmarks
 Author: benediktstroebl
@@ -35,7 +35,7 @@ The Harbor Rewardkit is a lightweight package to define and run verifiers. Rewar
 ## Installation
 ```bash
-uv tool install harbor-reward-kit
+uv tool install harbor-rewardkit
 ```
 ## Example: Programmatic criteria
@@ -67,7 +67,7 @@ Add rewardkit to your `test.sh` file:
 ```bash
 # tests/test.sh
-uvx harbor-reward-kit@0.1 /tests
+uvx harbor-rewardkit@0.1 /tests
 ```
 See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).

@@ -8,7 +8,7 @@ The Harbor Rewardkit is a lightweight package to define and run verifiers. Rewar
 ## Installation
 ```bash
-uv tool install harbor-reward-kit
+uv tool install harbor-rewardkit
 ```
 ## Example: Programmatic criteria
@@ -40,7 +40,7 @@ Add rewardkit to your `test.sh` file:
 ```bash
 # tests/test.sh
-uvx harbor-reward-kit@0.1 /tests
+uvx harbor-rewardkit@0.1 /tests
 ```
 See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).

@@ -1,6 +1,6 @@
 [project]
 name = "harbor-rewardkit"
-version = "0.1.dev0"
+version = "0.1.dev1"
 description = "Lightweight grading toolkit for environment-based tasks."
 readme = "README.md"
 license = "Apache-2.0"

harbor-rewardkit 0.1.dev0__tar.gz → 0.1.dev1__tar.gz