harbor-rewardkit 0.1.dev3__tar.gz → 0.1.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/PKG-INFO +1 -1
  2. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/pyproject.toml +1 -1
  3. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/xlsx_cell_equals.py +6 -4
  4. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/isolation.py +1 -1
  5. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/judges.py +142 -42
  6. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/models.py +10 -0
  7. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/reward.py +18 -2
  8. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/runner.py +27 -3
  9. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/README.md +0 -0
  10. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/__init__.py +0 -0
  11. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/__main__.py +0 -0
  12. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/compare.py +0 -0
  13. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/__init__.py +0 -0
  14. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/_command.py +0 -0
  15. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/_trajectory.py +0 -0
  16. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_contains.py +0 -0
  17. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_matches.py +0 -0
  18. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_matches_regex.py +0 -0
  19. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_succeeds.py +0 -0
  20. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/csv_cell_equals.py +0 -0
  21. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/diff_ratio.py +0 -0
  22. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_contains.py +0 -0
  23. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_contains_regex.py +0 -0
  24. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_exists.py +0 -0
  25. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_matches.py +0 -0
  26. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_not_exists.py +0 -0
  27. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/files_equal.py +0 -0
  28. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/http_response_contains.py +0 -0
  29. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/http_status_equals.py +0 -0
  30. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_similarity.py +0 -0
  31. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_size_equals.py +0 -0
  32. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_key_equals.py +0 -0
  33. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_path_equals.py +0 -0
  34. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/sqlite_query_equals.py +0 -0
  35. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_tool_not_used.py +0 -0
  36. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_tool_used.py +0 -0
  37. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_turn_count.py +0 -0
  38. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/agent.md +0 -0
  39. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/llm.md +0 -0
  40. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/llm_trajectory.md +0 -0
  41. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/session.py +0 -0
  42. {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/trajectory.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: harbor-rewardkit
3
- Version: 0.1.dev3
3
+ Version: 0.1.dev4
4
4
  Summary: Lightweight grading toolkit for environment-based tasks.
5
5
  Keywords: grading,evaluation,rewards,llm,agents,benchmarks
6
6
  Author: benediktstroebl
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "harbor-rewardkit"
3
- version = "0.1.dev3"
3
+ version = "0.1.dev4"
4
4
  description = "Lightweight grading toolkit for environment-based tasks."
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -21,10 +21,6 @@ def xlsx_cell_equals(
21
21
  wb = openpyxl.load_workbook(
22
22
  str(workspace / path), read_only=True, data_only=True
23
23
  )
24
- ws = wb[sheet] if sheet else wb.active
25
- value = ws[cell].value
26
- wb.close()
27
- return value == expected
28
24
  except (FileNotFoundError, OSError) as e:
29
25
  if isinstance(e, FileNotFoundError):
30
26
  warnings.warn(
@@ -32,5 +28,11 @@ def xlsx_cell_equals(
32
28
  stacklevel=2,
33
29
  )
34
30
  return False
31
+ try:
32
+ ws = wb[sheet] if sheet else wb.active
33
+ value = ws[cell].value
34
+ return value == expected
35
35
  except (KeyError, ValueError):
36
36
  return False
37
+ finally:
38
+ wb.close()
@@ -112,8 +112,8 @@ class _Overlay:
112
112
  def isolate(path: Path) -> Generator[Path, None, None]:
113
113
  """Yield an overlayfs view of *path*. Writes go to a tmpdir; *path* is untouched."""
114
114
  ov = _Overlay(path)
115
- ov.mount()
116
115
  try:
116
+ ov.mount()
117
117
  yield ov._merged
118
118
  finally:
119
119
  ov.cleanup()
@@ -10,6 +10,7 @@ import os
10
10
  import re
11
11
  import shutil
12
12
  import subprocess
13
+ import tempfile
13
14
  from importlib import resources
14
15
  from pathlib import Path
15
16
  from typing import Any
@@ -43,6 +44,28 @@ def _build_criteria_block(criteria: list[Criterion]) -> str:
43
44
  return "\n".join(lines)
44
45
 
45
46
 
47
+ def _build_response_schema(criteria: list[Criterion]) -> dict[str, Any]:
48
+ """Build a JSON Schema that enforces the expected judge response structure."""
49
+ props: dict[str, Any] = {}
50
+ for c in criteria:
51
+ name = c.name or "criterion"
52
+ props[name] = {
53
+ "type": "object",
54
+ "properties": {
55
+ "score": c.output_format.json_schema(),
56
+ "reasoning": {"type": "string"},
57
+ },
58
+ "required": ["score", "reasoning"],
59
+ "additionalProperties": False,
60
+ }
61
+ return {
62
+ "type": "object",
63
+ "properties": props,
64
+ "required": list(props.keys()),
65
+ "additionalProperties": False,
66
+ }
67
+
68
+
46
69
  def build_prompt(
47
70
  criteria: list[Criterion],
48
71
  template: str | None = None,
@@ -129,6 +152,9 @@ def _text_from_blocks(blocks: list[ContentBlock]) -> str:
129
152
  return "\n\n".join(b["text"] for b in blocks if b.get("type") == "text")
130
153
 
131
154
 
155
+ _MAX_JUDGE_RETRIES = 3
156
+
157
+
132
158
  def parse_judge_response(
133
159
  text: str,
134
160
  criteria: list[Criterion],
@@ -147,8 +173,13 @@ def parse_judge_response(
147
173
  scores: list[Score] = []
148
174
  for i, c in enumerate(criteria):
149
175
  cname = c.name or f"criterion_{i}"
150
- entry = data.get(cname, {})
151
- raw_score = entry.get("score", 0)
176
+ entry = data.get(cname)
177
+ if not isinstance(entry, dict) or "score" not in entry:
178
+ raise ValueError(
179
+ f"Criterion {cname!r}: expected dict with 'score' and 'reasoning', "
180
+ f"got {type(entry).__name__}: {str(entry)[:100]}"
181
+ )
182
+ raw_score = entry["score"]
152
183
  reasoning = entry.get("reasoning", "")
153
184
  value = c.output_format.normalize(raw_score)
154
185
  weight = weights[i] if weights else 1.0
@@ -202,6 +233,14 @@ async def arun_llm(
202
233
  )
203
234
  available_tokens = max_input_tokens - prompt_tokens - user_tokens - 32_000
204
235
 
236
+ if available_tokens <= 0:
237
+ raise ValueError(
238
+ f"Trajectory too large to include in judge prompt: "
239
+ f"no token budget remaining "
240
+ f"(prompt={prompt_tokens}, user={user_tokens}, "
241
+ f"limit={max_input_tokens})."
242
+ )
243
+
205
244
  traj_text = format_trajectory(
206
245
  judge.atif_trajectory,
207
246
  max_tokens=available_tokens,
@@ -213,17 +252,34 @@ async def arun_llm(
213
252
  messages: list[dict[str, Any]] = [{"role": "system", "content": prompt}]
214
253
  if user_blocks:
215
254
  messages.append({"role": "user", "content": user_blocks})
216
- resp = await litellm.acompletion(
217
- model=judge.model,
218
- messages=messages,
219
- response_format={"type": "json_object"},
220
- max_tokens=4096,
221
- timeout=judge.timeout,
222
- reasoning_effort=judge.reasoning_effort,
223
- )
224
- raw_output = resp.choices[0].message.content
225
- scores = parse_judge_response(raw_output, criteria, weights)
226
- return scores, raw_output, warn_list
255
+ for attempt in range(_MAX_JUDGE_RETRIES):
256
+ resp = await litellm.acompletion(
257
+ model=judge.model,
258
+ messages=messages,
259
+ response_format={
260
+ "type": "json_schema",
261
+ "json_schema": {
262
+ "name": "judge_response",
263
+ "schema": _build_response_schema(criteria),
264
+ "strict": True,
265
+ },
266
+ },
267
+ timeout=judge.timeout,
268
+ reasoning_effort=judge.reasoning_effort,
269
+ )
270
+ raw_output = resp.choices[0].message.content
271
+ try:
272
+ scores = parse_judge_response(raw_output, criteria, weights)
273
+ return scores, raw_output, warn_list
274
+ except ValueError:
275
+ if attempt == _MAX_JUDGE_RETRIES - 1:
276
+ raise
277
+ logger.debug(
278
+ "Judge response did not match schema, retrying (%d/%d)",
279
+ attempt + 1,
280
+ _MAX_JUDGE_RETRIES,
281
+ )
282
+ raise RuntimeError("Unreachable")
227
283
 
228
284
 
229
285
  def _is_alpine() -> bool:
@@ -291,45 +347,89 @@ async def arun_agent(
291
347
  prompt = build_prompt(criteria, kind="agent")
292
348
  if judge.atif_trajectory:
293
349
  prompt += f"\n\nThe agent's trajectory is stored at: {judge.atif_trajectory}"
350
+ schema = _build_response_schema(criteria)
351
+ schema_path: str | None = None
294
352
  if judge.agent == "claude-code":
295
- cmd = ["claude", "-p", prompt, "--output-format", "json"]
353
+ cmd = [
354
+ "claude",
355
+ "-p",
356
+ prompt,
357
+ "--output-format",
358
+ "json",
359
+ "--json-schema",
360
+ json.dumps(schema),
361
+ ]
296
362
  cmd_name = "claude"
297
363
  else:
298
- cmd = ["codex", "exec", prompt]
364
+ fd, schema_path = tempfile.mkstemp(suffix=".json")
365
+ with os.fdopen(fd, "w") as f:
366
+ json.dump(schema, f)
367
+ cmd = ["codex", "exec", prompt, "--output-schema", schema_path]
299
368
  cmd_name = "codex"
300
369
 
301
370
  if judge.model:
371
+ model_name = judge.model
372
+ # Claude CLI uses bare model names (e.g. "claude-haiku-4-5"),
373
+ # not provider-prefixed ones (e.g. "anthropic/claude-haiku-4-5").
374
+ if judge.agent == "claude-code" and model_name.startswith("anthropic/"):
375
+ model_name = model_name.removeprefix("anthropic/")
302
376
  flag = "-m" if judge.agent == "codex" else "--model"
303
- cmd.extend([flag, judge.model])
377
+ cmd.extend([flag, model_name])
304
378
 
305
379
  _ensure_cli(cmd_name)
306
380
  cwd = judge.cwd or (
307
381
  str(workspace) if workspace and Path(workspace).is_dir() else None
308
382
  )
309
- proc = await asyncio.create_subprocess_exec(
310
- *cmd,
311
- stdout=asyncio.subprocess.PIPE,
312
- stderr=asyncio.subprocess.PIPE,
313
- cwd=cwd,
314
- )
383
+
315
384
  try:
316
- stdout, _stderr = await asyncio.wait_for(
317
- proc.communicate(), timeout=judge.timeout
318
- )
319
- except asyncio.TimeoutError:
320
- proc.kill()
321
- await proc.communicate()
322
- raise
323
- raw_output = stdout.decode()
324
- # Claude CLI with --output-format json wraps the actual response in a
325
- # JSON envelope with a "result" field. Extract the inner text so
326
- # parse_judge_response finds the scoring JSON, not the wrapper.
327
- if judge.agent == "claude-code":
328
- try:
329
- envelope = json.loads(raw_output)
330
- if isinstance(envelope, dict) and "result" in envelope:
331
- raw_output = envelope["result"]
332
- except (json.JSONDecodeError, TypeError):
333
- pass
334
- scores = parse_judge_response(raw_output, criteria, weights)
335
- return scores, raw_output, warn_list
385
+ for attempt in range(_MAX_JUDGE_RETRIES):
386
+ proc = await asyncio.create_subprocess_exec(
387
+ *cmd,
388
+ stdout=asyncio.subprocess.PIPE,
389
+ stderr=asyncio.subprocess.PIPE,
390
+ cwd=cwd,
391
+ )
392
+ try:
393
+ stdout, _stderr = await asyncio.wait_for(
394
+ proc.communicate(), timeout=judge.timeout
395
+ )
396
+ except asyncio.TimeoutError:
397
+ proc.kill()
398
+ await proc.communicate()
399
+ raise
400
+ raw_output = stdout.decode()
401
+ if proc.returncode != 0:
402
+ stderr_text = _stderr.decode().strip() if _stderr else ""
403
+ raise ValueError(
404
+ f"Agent CLI '{cmd_name}' exited with code {proc.returncode}: "
405
+ f"{stderr_text or raw_output[:200]}"
406
+ )
407
+ # Claude CLI with --output-format json and --json-schema wraps the
408
+ # structured response in an envelope with a "structured_output" field.
409
+ if judge.agent == "claude-code":
410
+ try:
411
+ envelope = json.loads(raw_output)
412
+ if isinstance(envelope, dict):
413
+ if envelope.get("is_error"):
414
+ raise ValueError(
415
+ f"Claude CLI returned an error: {envelope.get('result', raw_output[:200])}"
416
+ )
417
+ if "structured_output" in envelope:
418
+ raw_output = json.dumps(envelope["structured_output"])
419
+ except (json.JSONDecodeError, TypeError):
420
+ pass
421
+ try:
422
+ scores = parse_judge_response(raw_output, criteria, weights)
423
+ return scores, raw_output, warn_list
424
+ except ValueError:
425
+ if attempt == _MAX_JUDGE_RETRIES - 1:
426
+ raise
427
+ logger.debug(
428
+ "Agent judge response did not match schema, retrying (%d/%d)",
429
+ attempt + 1,
430
+ _MAX_JUDGE_RETRIES,
431
+ )
432
+ raise RuntimeError("Unreachable")
433
+ finally:
434
+ if schema_path:
435
+ Path(schema_path).unlink(missing_ok=True)
@@ -14,6 +14,7 @@ Aggregation = Literal["weighted_mean", "all_pass", "any_pass", "threshold"]
14
14
  class OutputFormat(Protocol):
15
15
  def normalize(self, raw: float | bool | str) -> float: ...
16
16
  def prompt_fragment(self) -> str: ...
17
+ def json_schema(self) -> dict[str, Any]: ...
17
18
 
18
19
 
19
20
  class Binary(BaseModel):
@@ -29,6 +30,9 @@ class Binary(BaseModel):
29
30
  def prompt_fragment(self) -> str:
30
31
  return '"yes" or "no"'
31
32
 
33
+ def json_schema(self) -> dict[str, Any]:
34
+ return {"type": "string", "enum": ["yes", "no"]}
35
+
32
36
 
33
37
  class Likert(BaseModel):
34
38
  model_config = ConfigDict(frozen=True)
@@ -43,6 +47,9 @@ class Likert(BaseModel):
43
47
  def prompt_fragment(self) -> str:
44
48
  return f"an integer from 1 to {self.points}"
45
49
 
50
+ def json_schema(self) -> dict[str, Any]:
51
+ return {"type": "integer"}
52
+
46
53
 
47
54
  class Numeric(BaseModel):
48
55
  model_config = ConfigDict(frozen=True)
@@ -59,6 +66,9 @@ class Numeric(BaseModel):
59
66
  def prompt_fragment(self) -> str:
60
67
  return f"a number from {self.min} to {self.max}"
61
68
 
69
+ def json_schema(self) -> dict[str, Any]:
70
+ return {"type": "number"}
71
+
62
72
 
63
73
  def _slugify(text: str) -> str:
64
74
  slug = re.sub(r"[^a-z0-9]+", "_", text[:40].lower())
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import asyncio
4
4
  import inspect
5
+ import warnings
5
6
  from contextlib import nullcontext
6
7
  from pathlib import Path
7
8
  from typing import Any, Awaitable, TypeVar
@@ -102,9 +103,24 @@ class Reward:
102
103
  if isinstance(raw, bool):
103
104
  value = 1.0 if raw else 0.0
104
105
  elif isinstance(raw, (int, float)):
105
- value = max(0.0, min(1.0, float(raw)))
106
+ value = float(raw)
107
+ if value > 1.0:
108
+ warnings.warn(
109
+ f"Criterion {fn_name!r} returned {value:.4f} which exceeds 1.0; "
110
+ f"score will not be clamped — verify your criterion logic.",
111
+ stacklevel=2,
112
+ )
113
+ elif value < 0.0:
114
+ warnings.warn(
115
+ f"Criterion {fn_name!r} returned {value:.4f} which is below 0.0; "
116
+ f"score will not be clamped — verify your criterion logic.",
117
+ stacklevel=2,
118
+ )
106
119
  else:
107
- value = 1.0 if raw else 0.0
120
+ raise TypeError(
121
+ f"Criterion {fn_name!r} returned {type(raw).__name__}, "
122
+ f"expected bool, int, or float."
123
+ )
108
124
 
109
125
  return Score(
110
126
  name=fn_name,
@@ -18,6 +18,7 @@ from rewardkit.models import (
18
18
  Criterion,
19
19
  LLMJudge,
20
20
  Likert,
21
+ Numeric,
21
22
  )
22
23
  from rewardkit.reward import Reward
23
24
  from rewardkit.session import Session, _builtin_names, _factory_registry, set_current
@@ -28,6 +29,13 @@ def _load_toml(path: Path) -> dict[str, Any]:
28
29
 
29
30
 
30
31
  def _import_py_file(path: Path) -> None:
32
+ """Import a Python file as a module, caching by file-path hash.
33
+
34
+ Once imported, subsequent calls with the same resolved path are
35
+ no-ops. This is intentional for the primary single-run container
36
+ use case but means repeated ``discover()`` or ``run()`` calls in a
37
+ REPL or notebook will not re-execute already-loaded criterion files.
38
+ """
31
39
  import hashlib
32
40
 
33
41
  digest = hashlib.sha1(str(path.resolve()).encode()).hexdigest()[:12]
@@ -48,6 +56,11 @@ def _build_criteria_from_toml(toml_criteria: list[dict[str, Any]]) -> list[Crite
48
56
  fmt_name = c.get("type", "binary")
49
57
  if fmt_name == "likert":
50
58
  output_format = Likert(points=c.get("points", 5))
59
+ elif fmt_name == "numeric":
60
+ output_format = Numeric(
61
+ min=c.get("min", 0.0),
62
+ max=c.get("max", 1.0),
63
+ )
51
64
  else:
52
65
  output_format = Binary()
53
66
  criteria.append(
@@ -372,12 +385,23 @@ def run_multi(
372
385
  to stdout for overlapping reward names.
373
386
  """
374
387
  all_rewards: list[Reward] = []
375
- dir_labels: list[str] = []
388
+ dir_labels = [Path(d).name for d in tests_dirs]
389
+ if len(dir_labels) != len(set(dir_labels)):
390
+ dupes = {name for name in dir_labels if dir_labels.count(name) > 1}
391
+ paths_by_label = {
392
+ name: [str(d) for d, n in zip(tests_dirs, dir_labels) if n == name]
393
+ for name in dupes
394
+ }
395
+ raise ValueError(
396
+ "Duplicate test directory basenames: "
397
+ + ", ".join(
398
+ f"{name!r} ({', '.join(ps)})" for name, ps in paths_by_label.items()
399
+ )
400
+ + ". Use directories with distinct basenames."
401
+ )
376
402
  dir_reward_ranges: list[tuple[int, int]] = [] # (start, end) indices
377
403
 
378
404
  for tests_dir in tests_dirs:
379
- label = Path(tests_dir).name
380
- dir_labels.append(label)
381
405
  rewards = discover(tests_dir, workspace=workspace)
382
406
  start = len(all_rewards)
383
407
  all_rewards.extend(rewards)