harbor-rewardkit 0.1.dev0__tar.gz → 0.1.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/PKG-INFO +3 -3
  2. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/README.md +2 -2
  3. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/pyproject.toml +1 -1
  4. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/judges.py +13 -2
  5. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/__init__.py +0 -0
  6. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/__main__.py +0 -0
  7. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/compare.py +0 -0
  8. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/__init__.py +0 -0
  9. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/_command.py +0 -0
  10. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/_trajectory.py +0 -0
  11. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/command_output_contains.py +0 -0
  12. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/command_output_matches.py +0 -0
  13. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/command_output_matches_regex.py +0 -0
  14. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/command_succeeds.py +0 -0
  15. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/csv_cell_equals.py +0 -0
  16. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/diff_ratio.py +0 -0
  17. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_contains.py +0 -0
  18. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_contains_regex.py +0 -0
  19. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_exists.py +0 -0
  20. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_matches.py +0 -0
  21. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/file_not_exists.py +0 -0
  22. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/files_equal.py +0 -0
  23. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/http_response_contains.py +0 -0
  24. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/http_status_equals.py +0 -0
  25. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/image_similarity.py +0 -0
  26. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/image_size_equals.py +0 -0
  27. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/json_key_equals.py +0 -0
  28. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/json_path_equals.py +0 -0
  29. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/sqlite_query_equals.py +0 -0
  30. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/trajectory_tool_not_used.py +0 -0
  31. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/trajectory_tool_used.py +0 -0
  32. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/trajectory_turn_count.py +0 -0
  33. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/criteria/xlsx_cell_equals.py +0 -0
  34. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/isolation.py +0 -0
  35. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/models.py +0 -0
  36. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/prompts/agent.md +0 -0
  37. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/prompts/llm.md +0 -0
  38. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/prompts/llm_trajectory.md +0 -0
  39. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/reward.py +0 -0
  40. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/runner.py +0 -0
  41. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/session.py +0 -0
  42. {harbor_rewardkit-0.1.dev0 → harbor_rewardkit-0.1.dev1}/src/rewardkit/trajectory.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: harbor-rewardkit
3
- Version: 0.1.dev0
3
+ Version: 0.1.dev1
4
4
  Summary: Lightweight grading toolkit for environment-based tasks.
5
5
  Keywords: grading,evaluation,rewards,llm,agents,benchmarks
6
6
  Author: benediktstroebl
@@ -35,7 +35,7 @@ The Harbor Rewardkit is a lightweight package to define and run verifiers. Rewar
35
35
  ## Installation
36
36
 
37
37
  ```bash
38
- uv tool install harbor-reward-kit
38
+ uv tool install harbor-rewardkit
39
39
  ```
40
40
 
41
41
  ## Example: Programmatic criteria
@@ -67,7 +67,7 @@ Add rewardkit to your `test.sh` file:
67
67
 
68
68
  ```bash
69
69
  # tests/test.sh
70
- uvx harbor-reward-kit@0.1 /tests
70
+ uvx harbor-rewardkit@0.1 /tests
71
71
  ```
72
72
 
73
73
  See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).
@@ -8,7 +8,7 @@ The Harbor Rewardkit is a lightweight package to define and run verifiers. Rewar
8
8
  ## Installation
9
9
 
10
10
  ```bash
11
- uv tool install harbor-reward-kit
11
+ uv tool install harbor-rewardkit
12
12
  ```
13
13
 
14
14
  ## Example: Programmatic criteria
@@ -40,7 +40,7 @@ Add rewardkit to your `test.sh` file:
40
40
 
41
41
  ```bash
42
42
  # tests/test.sh
43
- uvx harbor-reward-kit@0.1 /tests
43
+ uvx harbor-rewardkit@0.1 /tests
44
44
  ```
45
45
 
46
46
  See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "harbor-rewardkit"
3
- version = "0.1.dev0"
3
+ version = "0.1.dev1"
4
4
  description = "Lightweight grading toolkit for environment-based tasks."
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -319,11 +319,12 @@ async def arun_agent(
319
319
  cmd = ["claude", "-p", prompt, "--output-format", "json"]
320
320
  cmd_name = "claude"
321
321
  else:
322
- cmd = ["codex", "-q", prompt]
322
+ cmd = ["codex", "exec", prompt]
323
323
  cmd_name = "codex"
324
324
 
325
325
  if judge.model:
326
- cmd.extend(["--model", judge.model])
326
+ flag = "-m" if judge.agent == "codex" else "--model"
327
+ cmd.extend([flag, judge.model])
327
328
 
328
329
  _ensure_cli(cmd_name)
329
330
  cwd = judge.cwd or (
@@ -344,6 +345,16 @@ async def arun_agent(
344
345
  await proc.communicate()
345
346
  raise
346
347
  raw_output = stdout.decode()
348
+ # Claude CLI with --output-format json wraps the actual response in a
349
+ # JSON envelope with a "result" field. Extract the inner text so
350
+ # parse_judge_response finds the scoring JSON, not the wrapper.
351
+ if judge.agent == "claude-code":
352
+ try:
353
+ envelope = json.loads(raw_output)
354
+ if isinstance(envelope, dict) and "result" in envelope:
355
+ raw_output = envelope["result"]
356
+ except (json.JSONDecodeError, TypeError):
357
+ pass
347
358
  try:
348
359
  scores = parse_judge_response(raw_output, criteria, weights)
349
360
  except (ValueError, json.JSONDecodeError) as e: