harbor-rewardkit 0.1.dev3__tar.gz → 0.1.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/PKG-INFO +1 -1
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/pyproject.toml +1 -1
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/xlsx_cell_equals.py +6 -4
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/isolation.py +1 -1
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/judges.py +142 -42
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/models.py +10 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/reward.py +18 -2
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/runner.py +27 -3
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/README.md +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/__init__.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/__main__.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/compare.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/__init__.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/_command.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/_trajectory.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_contains.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_matches.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_output_matches_regex.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_succeeds.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/csv_cell_equals.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/diff_ratio.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_contains.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_contains_regex.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_exists.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_matches.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_not_exists.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/files_equal.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/http_response_contains.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/http_status_equals.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_similarity.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_size_equals.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_key_equals.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_path_equals.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/sqlite_query_equals.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_tool_not_used.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_tool_used.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/trajectory_turn_count.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/agent.md +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/llm.md +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/llm_trajectory.md +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/session.py +0 -0
- {harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/trajectory.py +0 -0
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/xlsx_cell_equals.py
RENAMED
|
@@ -21,10 +21,6 @@ def xlsx_cell_equals(
|
|
|
21
21
|
wb = openpyxl.load_workbook(
|
|
22
22
|
str(workspace / path), read_only=True, data_only=True
|
|
23
23
|
)
|
|
24
|
-
ws = wb[sheet] if sheet else wb.active
|
|
25
|
-
value = ws[cell].value
|
|
26
|
-
wb.close()
|
|
27
|
-
return value == expected
|
|
28
24
|
except (FileNotFoundError, OSError) as e:
|
|
29
25
|
if isinstance(e, FileNotFoundError):
|
|
30
26
|
warnings.warn(
|
|
@@ -32,5 +28,11 @@ def xlsx_cell_equals(
|
|
|
32
28
|
stacklevel=2,
|
|
33
29
|
)
|
|
34
30
|
return False
|
|
31
|
+
try:
|
|
32
|
+
ws = wb[sheet] if sheet else wb.active
|
|
33
|
+
value = ws[cell].value
|
|
34
|
+
return value == expected
|
|
35
35
|
except (KeyError, ValueError):
|
|
36
36
|
return False
|
|
37
|
+
finally:
|
|
38
|
+
wb.close()
|
|
@@ -112,8 +112,8 @@ class _Overlay:
|
|
|
112
112
|
def isolate(path: Path) -> Generator[Path, None, None]:
|
|
113
113
|
"""Yield an overlayfs view of *path*. Writes go to a tmpdir; *path* is untouched."""
|
|
114
114
|
ov = _Overlay(path)
|
|
115
|
-
ov.mount()
|
|
116
115
|
try:
|
|
116
|
+
ov.mount()
|
|
117
117
|
yield ov._merged
|
|
118
118
|
finally:
|
|
119
119
|
ov.cleanup()
|
|
@@ -10,6 +10,7 @@ import os
|
|
|
10
10
|
import re
|
|
11
11
|
import shutil
|
|
12
12
|
import subprocess
|
|
13
|
+
import tempfile
|
|
13
14
|
from importlib import resources
|
|
14
15
|
from pathlib import Path
|
|
15
16
|
from typing import Any
|
|
@@ -43,6 +44,28 @@ def _build_criteria_block(criteria: list[Criterion]) -> str:
|
|
|
43
44
|
return "\n".join(lines)
|
|
44
45
|
|
|
45
46
|
|
|
47
|
+
def _build_response_schema(criteria: list[Criterion]) -> dict[str, Any]:
|
|
48
|
+
"""Build a JSON Schema that enforces the expected judge response structure."""
|
|
49
|
+
props: dict[str, Any] = {}
|
|
50
|
+
for c in criteria:
|
|
51
|
+
name = c.name or "criterion"
|
|
52
|
+
props[name] = {
|
|
53
|
+
"type": "object",
|
|
54
|
+
"properties": {
|
|
55
|
+
"score": c.output_format.json_schema(),
|
|
56
|
+
"reasoning": {"type": "string"},
|
|
57
|
+
},
|
|
58
|
+
"required": ["score", "reasoning"],
|
|
59
|
+
"additionalProperties": False,
|
|
60
|
+
}
|
|
61
|
+
return {
|
|
62
|
+
"type": "object",
|
|
63
|
+
"properties": props,
|
|
64
|
+
"required": list(props.keys()),
|
|
65
|
+
"additionalProperties": False,
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
46
69
|
def build_prompt(
|
|
47
70
|
criteria: list[Criterion],
|
|
48
71
|
template: str | None = None,
|
|
@@ -129,6 +152,9 @@ def _text_from_blocks(blocks: list[ContentBlock]) -> str:
|
|
|
129
152
|
return "\n\n".join(b["text"] for b in blocks if b.get("type") == "text")
|
|
130
153
|
|
|
131
154
|
|
|
155
|
+
_MAX_JUDGE_RETRIES = 3
|
|
156
|
+
|
|
157
|
+
|
|
132
158
|
def parse_judge_response(
|
|
133
159
|
text: str,
|
|
134
160
|
criteria: list[Criterion],
|
|
@@ -147,8 +173,13 @@ def parse_judge_response(
|
|
|
147
173
|
scores: list[Score] = []
|
|
148
174
|
for i, c in enumerate(criteria):
|
|
149
175
|
cname = c.name or f"criterion_{i}"
|
|
150
|
-
entry = data.get(cname
|
|
151
|
-
|
|
176
|
+
entry = data.get(cname)
|
|
177
|
+
if not isinstance(entry, dict) or "score" not in entry:
|
|
178
|
+
raise ValueError(
|
|
179
|
+
f"Criterion {cname!r}: expected dict with 'score' and 'reasoning', "
|
|
180
|
+
f"got {type(entry).__name__}: {str(entry)[:100]}"
|
|
181
|
+
)
|
|
182
|
+
raw_score = entry["score"]
|
|
152
183
|
reasoning = entry.get("reasoning", "")
|
|
153
184
|
value = c.output_format.normalize(raw_score)
|
|
154
185
|
weight = weights[i] if weights else 1.0
|
|
@@ -202,6 +233,14 @@ async def arun_llm(
|
|
|
202
233
|
)
|
|
203
234
|
available_tokens = max_input_tokens - prompt_tokens - user_tokens - 32_000
|
|
204
235
|
|
|
236
|
+
if available_tokens <= 0:
|
|
237
|
+
raise ValueError(
|
|
238
|
+
f"Trajectory too large to include in judge prompt: "
|
|
239
|
+
f"no token budget remaining "
|
|
240
|
+
f"(prompt={prompt_tokens}, user={user_tokens}, "
|
|
241
|
+
f"limit={max_input_tokens})."
|
|
242
|
+
)
|
|
243
|
+
|
|
205
244
|
traj_text = format_trajectory(
|
|
206
245
|
judge.atif_trajectory,
|
|
207
246
|
max_tokens=available_tokens,
|
|
@@ -213,17 +252,34 @@ async def arun_llm(
|
|
|
213
252
|
messages: list[dict[str, Any]] = [{"role": "system", "content": prompt}]
|
|
214
253
|
if user_blocks:
|
|
215
254
|
messages.append({"role": "user", "content": user_blocks})
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
255
|
+
for attempt in range(_MAX_JUDGE_RETRIES):
|
|
256
|
+
resp = await litellm.acompletion(
|
|
257
|
+
model=judge.model,
|
|
258
|
+
messages=messages,
|
|
259
|
+
response_format={
|
|
260
|
+
"type": "json_schema",
|
|
261
|
+
"json_schema": {
|
|
262
|
+
"name": "judge_response",
|
|
263
|
+
"schema": _build_response_schema(criteria),
|
|
264
|
+
"strict": True,
|
|
265
|
+
},
|
|
266
|
+
},
|
|
267
|
+
timeout=judge.timeout,
|
|
268
|
+
reasoning_effort=judge.reasoning_effort,
|
|
269
|
+
)
|
|
270
|
+
raw_output = resp.choices[0].message.content
|
|
271
|
+
try:
|
|
272
|
+
scores = parse_judge_response(raw_output, criteria, weights)
|
|
273
|
+
return scores, raw_output, warn_list
|
|
274
|
+
except ValueError:
|
|
275
|
+
if attempt == _MAX_JUDGE_RETRIES - 1:
|
|
276
|
+
raise
|
|
277
|
+
logger.debug(
|
|
278
|
+
"Judge response did not match schema, retrying (%d/%d)",
|
|
279
|
+
attempt + 1,
|
|
280
|
+
_MAX_JUDGE_RETRIES,
|
|
281
|
+
)
|
|
282
|
+
raise RuntimeError("Unreachable")
|
|
227
283
|
|
|
228
284
|
|
|
229
285
|
def _is_alpine() -> bool:
|
|
@@ -291,45 +347,89 @@ async def arun_agent(
|
|
|
291
347
|
prompt = build_prompt(criteria, kind="agent")
|
|
292
348
|
if judge.atif_trajectory:
|
|
293
349
|
prompt += f"\n\nThe agent's trajectory is stored at: {judge.atif_trajectory}"
|
|
350
|
+
schema = _build_response_schema(criteria)
|
|
351
|
+
schema_path: str | None = None
|
|
294
352
|
if judge.agent == "claude-code":
|
|
295
|
-
cmd = [
|
|
353
|
+
cmd = [
|
|
354
|
+
"claude",
|
|
355
|
+
"-p",
|
|
356
|
+
prompt,
|
|
357
|
+
"--output-format",
|
|
358
|
+
"json",
|
|
359
|
+
"--json-schema",
|
|
360
|
+
json.dumps(schema),
|
|
361
|
+
]
|
|
296
362
|
cmd_name = "claude"
|
|
297
363
|
else:
|
|
298
|
-
|
|
364
|
+
fd, schema_path = tempfile.mkstemp(suffix=".json")
|
|
365
|
+
with os.fdopen(fd, "w") as f:
|
|
366
|
+
json.dump(schema, f)
|
|
367
|
+
cmd = ["codex", "exec", prompt, "--output-schema", schema_path]
|
|
299
368
|
cmd_name = "codex"
|
|
300
369
|
|
|
301
370
|
if judge.model:
|
|
371
|
+
model_name = judge.model
|
|
372
|
+
# Claude CLI uses bare model names (e.g. "claude-haiku-4-5"),
|
|
373
|
+
# not provider-prefixed ones (e.g. "anthropic/claude-haiku-4-5").
|
|
374
|
+
if judge.agent == "claude-code" and model_name.startswith("anthropic/"):
|
|
375
|
+
model_name = model_name.removeprefix("anthropic/")
|
|
302
376
|
flag = "-m" if judge.agent == "codex" else "--model"
|
|
303
|
-
cmd.extend([flag,
|
|
377
|
+
cmd.extend([flag, model_name])
|
|
304
378
|
|
|
305
379
|
_ensure_cli(cmd_name)
|
|
306
380
|
cwd = judge.cwd or (
|
|
307
381
|
str(workspace) if workspace and Path(workspace).is_dir() else None
|
|
308
382
|
)
|
|
309
|
-
|
|
310
|
-
*cmd,
|
|
311
|
-
stdout=asyncio.subprocess.PIPE,
|
|
312
|
-
stderr=asyncio.subprocess.PIPE,
|
|
313
|
-
cwd=cwd,
|
|
314
|
-
)
|
|
383
|
+
|
|
315
384
|
try:
|
|
316
|
-
|
|
317
|
-
proc
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
385
|
+
for attempt in range(_MAX_JUDGE_RETRIES):
|
|
386
|
+
proc = await asyncio.create_subprocess_exec(
|
|
387
|
+
*cmd,
|
|
388
|
+
stdout=asyncio.subprocess.PIPE,
|
|
389
|
+
stderr=asyncio.subprocess.PIPE,
|
|
390
|
+
cwd=cwd,
|
|
391
|
+
)
|
|
392
|
+
try:
|
|
393
|
+
stdout, _stderr = await asyncio.wait_for(
|
|
394
|
+
proc.communicate(), timeout=judge.timeout
|
|
395
|
+
)
|
|
396
|
+
except asyncio.TimeoutError:
|
|
397
|
+
proc.kill()
|
|
398
|
+
await proc.communicate()
|
|
399
|
+
raise
|
|
400
|
+
raw_output = stdout.decode()
|
|
401
|
+
if proc.returncode != 0:
|
|
402
|
+
stderr_text = _stderr.decode().strip() if _stderr else ""
|
|
403
|
+
raise ValueError(
|
|
404
|
+
f"Agent CLI '{cmd_name}' exited with code {proc.returncode}: "
|
|
405
|
+
f"{stderr_text or raw_output[:200]}"
|
|
406
|
+
)
|
|
407
|
+
# Claude CLI with --output-format json and --json-schema wraps the
|
|
408
|
+
# structured response in an envelope with a "structured_output" field.
|
|
409
|
+
if judge.agent == "claude-code":
|
|
410
|
+
try:
|
|
411
|
+
envelope = json.loads(raw_output)
|
|
412
|
+
if isinstance(envelope, dict):
|
|
413
|
+
if envelope.get("is_error"):
|
|
414
|
+
raise ValueError(
|
|
415
|
+
f"Claude CLI returned an error: {envelope.get('result', raw_output[:200])}"
|
|
416
|
+
)
|
|
417
|
+
if "structured_output" in envelope:
|
|
418
|
+
raw_output = json.dumps(envelope["structured_output"])
|
|
419
|
+
except (json.JSONDecodeError, TypeError):
|
|
420
|
+
pass
|
|
421
|
+
try:
|
|
422
|
+
scores = parse_judge_response(raw_output, criteria, weights)
|
|
423
|
+
return scores, raw_output, warn_list
|
|
424
|
+
except ValueError:
|
|
425
|
+
if attempt == _MAX_JUDGE_RETRIES - 1:
|
|
426
|
+
raise
|
|
427
|
+
logger.debug(
|
|
428
|
+
"Agent judge response did not match schema, retrying (%d/%d)",
|
|
429
|
+
attempt + 1,
|
|
430
|
+
_MAX_JUDGE_RETRIES,
|
|
431
|
+
)
|
|
432
|
+
raise RuntimeError("Unreachable")
|
|
433
|
+
finally:
|
|
434
|
+
if schema_path:
|
|
435
|
+
Path(schema_path).unlink(missing_ok=True)
|
|
@@ -14,6 +14,7 @@ Aggregation = Literal["weighted_mean", "all_pass", "any_pass", "threshold"]
|
|
|
14
14
|
class OutputFormat(Protocol):
|
|
15
15
|
def normalize(self, raw: float | bool | str) -> float: ...
|
|
16
16
|
def prompt_fragment(self) -> str: ...
|
|
17
|
+
def json_schema(self) -> dict[str, Any]: ...
|
|
17
18
|
|
|
18
19
|
|
|
19
20
|
class Binary(BaseModel):
|
|
@@ -29,6 +30,9 @@ class Binary(BaseModel):
|
|
|
29
30
|
def prompt_fragment(self) -> str:
|
|
30
31
|
return '"yes" or "no"'
|
|
31
32
|
|
|
33
|
+
def json_schema(self) -> dict[str, Any]:
|
|
34
|
+
return {"type": "string", "enum": ["yes", "no"]}
|
|
35
|
+
|
|
32
36
|
|
|
33
37
|
class Likert(BaseModel):
|
|
34
38
|
model_config = ConfigDict(frozen=True)
|
|
@@ -43,6 +47,9 @@ class Likert(BaseModel):
|
|
|
43
47
|
def prompt_fragment(self) -> str:
|
|
44
48
|
return f"an integer from 1 to {self.points}"
|
|
45
49
|
|
|
50
|
+
def json_schema(self) -> dict[str, Any]:
|
|
51
|
+
return {"type": "integer"}
|
|
52
|
+
|
|
46
53
|
|
|
47
54
|
class Numeric(BaseModel):
|
|
48
55
|
model_config = ConfigDict(frozen=True)
|
|
@@ -59,6 +66,9 @@ class Numeric(BaseModel):
|
|
|
59
66
|
def prompt_fragment(self) -> str:
|
|
60
67
|
return f"a number from {self.min} to {self.max}"
|
|
61
68
|
|
|
69
|
+
def json_schema(self) -> dict[str, Any]:
|
|
70
|
+
return {"type": "number"}
|
|
71
|
+
|
|
62
72
|
|
|
63
73
|
def _slugify(text: str) -> str:
|
|
64
74
|
slug = re.sub(r"[^a-z0-9]+", "_", text[:40].lower())
|
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import inspect
|
|
5
|
+
import warnings
|
|
5
6
|
from contextlib import nullcontext
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import Any, Awaitable, TypeVar
|
|
@@ -102,9 +103,24 @@ class Reward:
|
|
|
102
103
|
if isinstance(raw, bool):
|
|
103
104
|
value = 1.0 if raw else 0.0
|
|
104
105
|
elif isinstance(raw, (int, float)):
|
|
105
|
-
value =
|
|
106
|
+
value = float(raw)
|
|
107
|
+
if value > 1.0:
|
|
108
|
+
warnings.warn(
|
|
109
|
+
f"Criterion {fn_name!r} returned {value:.4f} which exceeds 1.0; "
|
|
110
|
+
f"score will not be clamped — verify your criterion logic.",
|
|
111
|
+
stacklevel=2,
|
|
112
|
+
)
|
|
113
|
+
elif value < 0.0:
|
|
114
|
+
warnings.warn(
|
|
115
|
+
f"Criterion {fn_name!r} returned {value:.4f} which is below 0.0; "
|
|
116
|
+
f"score will not be clamped — verify your criterion logic.",
|
|
117
|
+
stacklevel=2,
|
|
118
|
+
)
|
|
106
119
|
else:
|
|
107
|
-
|
|
120
|
+
raise TypeError(
|
|
121
|
+
f"Criterion {fn_name!r} returned {type(raw).__name__}, "
|
|
122
|
+
f"expected bool, int, or float."
|
|
123
|
+
)
|
|
108
124
|
|
|
109
125
|
return Score(
|
|
110
126
|
name=fn_name,
|
|
@@ -18,6 +18,7 @@ from rewardkit.models import (
|
|
|
18
18
|
Criterion,
|
|
19
19
|
LLMJudge,
|
|
20
20
|
Likert,
|
|
21
|
+
Numeric,
|
|
21
22
|
)
|
|
22
23
|
from rewardkit.reward import Reward
|
|
23
24
|
from rewardkit.session import Session, _builtin_names, _factory_registry, set_current
|
|
@@ -28,6 +29,13 @@ def _load_toml(path: Path) -> dict[str, Any]:
|
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
def _import_py_file(path: Path) -> None:
|
|
32
|
+
"""Import a Python file as a module, caching by file-path hash.
|
|
33
|
+
|
|
34
|
+
Once imported, subsequent calls with the same resolved path are
|
|
35
|
+
no-ops. This is intentional for the primary single-run container
|
|
36
|
+
use case but means repeated ``discover()`` or ``run()`` calls in a
|
|
37
|
+
REPL or notebook will not re-execute already-loaded criterion files.
|
|
38
|
+
"""
|
|
31
39
|
import hashlib
|
|
32
40
|
|
|
33
41
|
digest = hashlib.sha1(str(path.resolve()).encode()).hexdigest()[:12]
|
|
@@ -48,6 +56,11 @@ def _build_criteria_from_toml(toml_criteria: list[dict[str, Any]]) -> list[Crite
|
|
|
48
56
|
fmt_name = c.get("type", "binary")
|
|
49
57
|
if fmt_name == "likert":
|
|
50
58
|
output_format = Likert(points=c.get("points", 5))
|
|
59
|
+
elif fmt_name == "numeric":
|
|
60
|
+
output_format = Numeric(
|
|
61
|
+
min=c.get("min", 0.0),
|
|
62
|
+
max=c.get("max", 1.0),
|
|
63
|
+
)
|
|
51
64
|
else:
|
|
52
65
|
output_format = Binary()
|
|
53
66
|
criteria.append(
|
|
@@ -372,12 +385,23 @@ def run_multi(
|
|
|
372
385
|
to stdout for overlapping reward names.
|
|
373
386
|
"""
|
|
374
387
|
all_rewards: list[Reward] = []
|
|
375
|
-
dir_labels
|
|
388
|
+
dir_labels = [Path(d).name for d in tests_dirs]
|
|
389
|
+
if len(dir_labels) != len(set(dir_labels)):
|
|
390
|
+
dupes = {name for name in dir_labels if dir_labels.count(name) > 1}
|
|
391
|
+
paths_by_label = {
|
|
392
|
+
name: [str(d) for d, n in zip(tests_dirs, dir_labels) if n == name]
|
|
393
|
+
for name in dupes
|
|
394
|
+
}
|
|
395
|
+
raise ValueError(
|
|
396
|
+
"Duplicate test directory basenames: "
|
|
397
|
+
+ ", ".join(
|
|
398
|
+
f"{name!r} ({', '.join(ps)})" for name, ps in paths_by_label.items()
|
|
399
|
+
)
|
|
400
|
+
+ ". Use directories with distinct basenames."
|
|
401
|
+
)
|
|
376
402
|
dir_reward_ranges: list[tuple[int, int]] = [] # (start, end) indices
|
|
377
403
|
|
|
378
404
|
for tests_dir in tests_dirs:
|
|
379
|
-
label = Path(tests_dir).name
|
|
380
|
-
dir_labels.append(label)
|
|
381
405
|
rewards = discover(tests_dir, workspace=workspace)
|
|
382
406
|
start = len(all_rewards)
|
|
383
407
|
all_rewards.extend(rewards)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/_trajectory.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/command_succeeds.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/csv_cell_equals.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/diff_ratio.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_contains.py
RENAMED
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_exists.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_matches.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/file_not_exists.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/files_equal.py
RENAMED
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/http_status_equals.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_similarity.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/image_size_equals.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_key_equals.py
RENAMED
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/criteria/json_path_equals.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{harbor_rewardkit-0.1.dev3 → harbor_rewardkit-0.1.dev4}/src/rewardkit/prompts/llm_trajectory.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|