PyPI - harbor-rewardkit - Versions diffs - 0.1.dev3__tar.gz → 0.1.dev4__tar.gz - Mend

harbor-rewardkit 0.1.dev3.tar.gz → 0.1.dev4.tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: harbor-rewardkit
-Version: 0.1.dev3
+Version: 0.1.dev4
 Summary: Lightweight grading toolkit for environment-based tasks.
 Keywords: grading,evaluation,rewards,llm,agents,benchmarks
 Author: benediktstroebl

{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "harbor-rewardkit"
-version = "0.1.dev3"
+version = "0.1.dev4"
 description = "Lightweight grading toolkit for environment-based tasks."
 readme = "README.md"
 license = "Apache-2.0"

{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/xlsx_cell_equals.py RENAMED Viewed

@@ -21,10 +21,6 @@ def xlsx_cell_equals(
         wb = openpyxl.load_workbook(
             str(workspace / path), read_only=True, data_only=True
         )
-        ws = wb[sheet] if sheet else wb.active
-        value = ws[cell].value
-        wb.close()
-        return value == expected
     except (FileNotFoundError, OSError) as e:
         if isinstance(e, FileNotFoundError):
             warnings.warn(
@@ -32,5 +28,11 @@ def xlsx_cell_equals(
                 stacklevel=2,
             )
         return False
+    try:
+        ws = wb[sheet] if sheet else wb.active
+        value = ws[cell].value
+        return value == expected
     except (KeyError, ValueError):
         return False
+    finally:
+        wb.close()

{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/isolation.py RENAMED Viewed

@@ -112,8 +112,8 @@ class _Overlay:
 def isolate(path: Path) -> Generator[Path, None, None]:
     """Yield an overlayfs view of *path*. Writes go to a tmpdir; *path* is untouched."""
     ov = _Overlay(path)
-    ov.mount()
     try:
+        ov.mount()
         yield ov._merged
     finally:
         ov.cleanup()

{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/judges.py RENAMED Viewed

@@ -10,6 +10,7 @@ import os
 import re
 import shutil
 import subprocess
+import tempfile
 from importlib import resources
 from pathlib import Path
 from typing import Any
@@ -43,6 +44,28 @@ def _build_criteria_block(criteria: list[Criterion]) -> str:
     return "\n".join(lines)
+def _build_response_schema(criteria: list[Criterion]) -> dict[str, Any]:
+    """Build a JSON Schema that enforces the expected judge response structure."""
+    props: dict[str, Any] = {}
+    for c in criteria:
+        name = c.name or "criterion"
+        props[name] = {
+            "type": "object",
+            "properties": {
+                "score": c.output_format.json_schema(),
+                "reasoning": {"type": "string"},
+            },
+            "required": ["score", "reasoning"],
+            "additionalProperties": False,
+        }
+    return {
+        "type": "object",
+        "properties": props,
+        "required": list(props.keys()),
+        "additionalProperties": False,
+    }
 def build_prompt(
     criteria: list[Criterion],
     template: str | None = None,
@@ -129,6 +152,9 @@ def _text_from_blocks(blocks: list[ContentBlock]) -> str:
     return "\n\n".join(b["text"] for b in blocks if b.get("type") == "text")
+_MAX_JUDGE_RETRIES = 3
 def parse_judge_response(
     text: str,
     criteria: list[Criterion],
@@ -147,8 +173,13 @@ def parse_judge_response(
     scores: list[Score] = []
     for i, c in enumerate(criteria):
         cname = c.name or f"criterion_{i}"
-        entry = data.get(cname, {})
-        raw_score = entry.get("score", 0)
+        entry = data.get(cname)
+        if not isinstance(entry, dict) or "score" not in entry:
+            raise ValueError(
+                f"Criterion {cname!r}: expected dict with 'score' and 'reasoning', "
+                f"got {type(entry).__name__}: {str(entry)[:100]}"
+            )
+        raw_score = entry["score"]
         reasoning = entry.get("reasoning", "")
         value = c.output_format.normalize(raw_score)
         weight = weights[i] if weights else 1.0
@@ -202,6 +233,14 @@ async def arun_llm(
         )
         available_tokens = max_input_tokens - prompt_tokens - user_tokens - 32_000
+        if available_tokens <= 0:
+            raise ValueError(
+                f"Trajectory too large to include in judge prompt: "
+                f"no token budget remaining "
+                f"(prompt={prompt_tokens}, user={user_tokens}, "
+                f"limit={max_input_tokens})."
+            )
         traj_text = format_trajectory(
             judge.atif_trajectory,
             max_tokens=available_tokens,
@@ -213,17 +252,34 @@ async def arun_llm(
     messages: list[dict[str, Any]] = [{"role": "system", "content": prompt}]
     if user_blocks:
         messages.append({"role": "user", "content": user_blocks})
-    resp = await litellm.acompletion(
-        model=judge.model,
-        messages=messages,
-        response_format={"type": "json_object"},
-        max_tokens=4096,
-        timeout=judge.timeout,
-        reasoning_effort=judge.reasoning_effort,
-    )
-    raw_output = resp.choices[0].message.content
-    scores = parse_judge_response(raw_output, criteria, weights)
-    return scores, raw_output, warn_list
+    for attempt in range(_MAX_JUDGE_RETRIES):
+        resp = await litellm.acompletion(
+            model=judge.model,
+            messages=messages,
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "judge_response",
+                    "schema": _build_response_schema(criteria),
+                    "strict": True,
+                },
+            },
+            timeout=judge.timeout,
+            reasoning_effort=judge.reasoning_effort,
+        )
+        raw_output = resp.choices[0].message.content
+        try:
+            scores = parse_judge_response(raw_output, criteria, weights)
+            return scores, raw_output, warn_list
+        except ValueError:
+            if attempt == _MAX_JUDGE_RETRIES - 1:
+                raise
+            logger.debug(
+                "Judge response did not match schema, retrying (%d/%d)",
+                attempt + 1,
+                _MAX_JUDGE_RETRIES,
+            )
+    raise RuntimeError("Unreachable")
 def _is_alpine() -> bool:
@@ -291,45 +347,89 @@ async def arun_agent(
         prompt = build_prompt(criteria, kind="agent")
     if judge.atif_trajectory:
         prompt += f"\n\nThe agent's trajectory is stored at: {judge.atif_trajectory}"
+    schema = _build_response_schema(criteria)
+    schema_path: str | None = None
     if judge.agent == "claude-code":
-        cmd = ["claude", "-p", prompt, "--output-format", "json"]
+        cmd = [
+            "claude",
+            "-p",
+            prompt,
+            "--output-format",
+            "json",
+            "--json-schema",
+            json.dumps(schema),
+        ]
         cmd_name = "claude"
     else:
-        cmd = ["codex", "exec", prompt]
+        fd, schema_path = tempfile.mkstemp(suffix=".json")
+        with os.fdopen(fd, "w") as f:
+            json.dump(schema, f)
+        cmd = ["codex", "exec", prompt, "--output-schema", schema_path]
         cmd_name = "codex"
     if judge.model:
+        model_name = judge.model
+        # Claude CLI uses bare model names (e.g. "claude-haiku-4-5"),
+        # not provider-prefixed ones (e.g. "anthropic/claude-haiku-4-5").
+        if judge.agent == "claude-code" and model_name.startswith("anthropic/"):
+            model_name = model_name.removeprefix("anthropic/")
         flag = "-m" if judge.agent == "codex" else "--model"
-        cmd.extend([flag, judge.model])
+        cmd.extend([flag, model_name])
     _ensure_cli(cmd_name)
     cwd = judge.cwd or (
         str(workspace) if workspace and Path(workspace).is_dir() else None
     )
-    proc = await asyncio.create_subprocess_exec(
-        *cmd,
-        stdout=asyncio.subprocess.PIPE,
-        stderr=asyncio.subprocess.PIPE,
-        cwd=cwd,
-    )
     try:
-        stdout, _stderr = await asyncio.wait_for(
-            proc.communicate(), timeout=judge.timeout
-        )
-    except asyncio.TimeoutError:
-        proc.kill()
-        await proc.communicate()
-        raise
-    raw_output = stdout.decode()
-    # Claude CLI with --output-format json wraps the actual response in a
-    # JSON envelope with a "result" field. Extract the inner text so
-    # parse_judge_response finds the scoring JSON, not the wrapper.
-    if judge.agent == "claude-code":
-        try:
-            envelope = json.loads(raw_output)
-            if isinstance(envelope, dict) and "result" in envelope:
-                raw_output = envelope["result"]
-        except (json.JSONDecodeError, TypeError):
-            pass
-    scores = parse_judge_response(raw_output, criteria, weights)
-    return scores, raw_output, warn_list
+        for attempt in range(_MAX_JUDGE_RETRIES):
+            proc = await asyncio.create_subprocess_exec(
+                *cmd,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+                cwd=cwd,
+            )
+            try:
+                stdout, _stderr = await asyncio.wait_for(
+                    proc.communicate(), timeout=judge.timeout
+                )
+            except asyncio.TimeoutError:
+                proc.kill()
+                await proc.communicate()
+                raise
+            raw_output = stdout.decode()
+            if proc.returncode != 0:
+                stderr_text = _stderr.decode().strip() if _stderr else ""
+                raise ValueError(
+                    f"Agent CLI '{cmd_name}' exited with code {proc.returncode}: "
+                    f"{stderr_text or raw_output[:200]}"
+                )
+            # Claude CLI with --output-format json and --json-schema wraps the
+            # structured response in an envelope with a "structured_output" field.
+            if judge.agent == "claude-code":
+                try:
+                    envelope = json.loads(raw_output)
+                    if isinstance(envelope, dict):
+                        if envelope.get("is_error"):
+                            raise ValueError(
+                                f"Claude CLI returned an error: {envelope.get('result', raw_output[:200])}"
+                            )
+                        if "structured_output" in envelope:
+                            raw_output = json.dumps(envelope["structured_output"])
+                except (json.JSONDecodeError, TypeError):
+                    pass
+            try:
+                scores = parse_judge_response(raw_output, criteria, weights)
+                return scores, raw_output, warn_list
+            except ValueError:
+                if attempt == _MAX_JUDGE_RETRIES - 1:
+                    raise
+                logger.debug(
+                    "Agent judge response did not match schema, retrying (%d/%d)",
+                    attempt + 1,
+                    _MAX_JUDGE_RETRIES,
+                )
+        raise RuntimeError("Unreachable")
+    finally:
+        if schema_path:
+            Path(schema_path).unlink(missing_ok=True)

{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/models.py RENAMED Viewed

@@ -14,6 +14,7 @@ Aggregation = Literal["weighted_mean", "all_pass", "any_pass", "threshold"]
 class OutputFormat(Protocol):
     def normalize(self, raw: float | bool | str) -> float: ...
     def prompt_fragment(self) -> str: ...
+    def json_schema(self) -> dict[str, Any]: ...
 class Binary(BaseModel):
@@ -29,6 +30,9 @@ class Binary(BaseModel):
     def prompt_fragment(self) -> str:
         return '"yes" or "no"'
+    def json_schema(self) -> dict[str, Any]:
+        return {"type": "string", "enum": ["yes", "no"]}
 class Likert(BaseModel):
     model_config = ConfigDict(frozen=True)
@@ -43,6 +47,9 @@ class Likert(BaseModel):
     def prompt_fragment(self) -> str:
         return f"an integer from 1 to {self.points}"
+    def json_schema(self) -> dict[str, Any]:
+        return {"type": "integer"}
 class Numeric(BaseModel):
     model_config = ConfigDict(frozen=True)
@@ -59,6 +66,9 @@ class Numeric(BaseModel):
     def prompt_fragment(self) -> str:
         return f"a number from {self.min} to {self.max}"
+    def json_schema(self) -> dict[str, Any]:
+        return {"type": "number"}
 def _slugify(text: str) -> str:
     slug = re.sub(r"[^a-z0-9]+", "_", text[:40].lower())

{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/reward.py RENAMED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 import asyncio
 import inspect
+import warnings
 from contextlib import nullcontext
 from pathlib import Path
 from typing import Any, Awaitable, TypeVar
@@ -102,9 +103,24 @@ class Reward:
             if isinstance(raw, bool):
                 value = 1.0 if raw else 0.0
             elif isinstance(raw, (int, float)):
-                value = max(0.0, min(1.0, float(raw)))
+                value = float(raw)
+                if value > 1.0:
+                    warnings.warn(
+                        f"Criterion {fn_name!r} returned {value:.4f} which exceeds 1.0; "
+                        f"score will not be clamped — verify your criterion logic.",
+                        stacklevel=2,
+                    )
+                elif value < 0.0:
+                    warnings.warn(
+                        f"Criterion {fn_name!r} returned {value:.4f} which is below 0.0; "
+                        f"score will not be clamped — verify your criterion logic.",
+                        stacklevel=2,
+                    )
             else:
-                value = 1.0 if raw else 0.0
+                raise TypeError(
+                    f"Criterion {fn_name!r} returned {type(raw).__name__}, "
+                    f"expected bool, int, or float."
+                )
             return Score(
                 name=fn_name,

{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/runner.py RENAMED Viewed

@@ -18,6 +18,7 @@ from rewardkit.models import (
     Criterion,
     LLMJudge,
     Likert,
+    Numeric,
 )
 from rewardkit.reward import Reward
 from rewardkit.session import Session, _builtin_names, _factory_registry, set_current
@@ -28,6 +29,13 @@ def _load_toml(path: Path) -> dict[str, Any]:
 def _import_py_file(path: Path) -> None:
+    """Import a Python file as a module, caching by file-path hash.
+    Once imported, subsequent calls with the same resolved path are
+    no-ops.  This is intentional for the primary single-run container
+    use case but means repeated ``discover()`` or ``run()`` calls in a
+    REPL or notebook will not re-execute already-loaded criterion files.
+    """
     import hashlib
     digest = hashlib.sha1(str(path.resolve()).encode()).hexdigest()[:12]
@@ -48,6 +56,11 @@ def _build_criteria_from_toml(toml_criteria: list[dict[str, Any]]) -> list[Crite
         fmt_name = c.get("type", "binary")
         if fmt_name == "likert":
             output_format = Likert(points=c.get("points", 5))
+        elif fmt_name == "numeric":
+            output_format = Numeric(
+                min=c.get("min", 0.0),
+                max=c.get("max", 1.0),
+            )
         else:
             output_format = Binary()
         criteria.append(
@@ -372,12 +385,23 @@ def run_multi(
     to stdout for overlapping reward names.
     """
     all_rewards: list[Reward] = []
-    dir_labels: list[str] = []
+    dir_labels = [Path(d).name for d in tests_dirs]
+    if len(dir_labels) != len(set(dir_labels)):
+        dupes = {name for name in dir_labels if dir_labels.count(name) > 1}
+        paths_by_label = {
+            name: [str(d) for d, n in zip(tests_dirs, dir_labels) if n == name]
+            for name in dupes
+        }
+        raise ValueError(
+            "Duplicate test directory basenames: "
+            + ", ".join(
+                f"{name!r} ({', '.join(ps)})" for name, ps in paths_by_label.items()
+            )
+            + ". Use directories with distinct basenames."
+        )
     dir_reward_ranges: list[tuple[int, int]] = []  # (start, end) indices
     for tests_dir in tests_dirs:
-        label = Path(tests_dir).name
-        dir_labels.append(label)
         rewards = discover(tests_dir, workspace=workspace)
         start = len(all_rewards)
         all_rewards.extend(rewards)