harbor-rewardkit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- harbor_rewardkit-0.1.0/PKG-INFO +73 -0
- harbor_rewardkit-0.1.0/README.md +46 -0
- harbor_rewardkit-0.1.0/pyproject.toml +42 -0
- harbor_rewardkit-0.1.0/src/rewardkit/__init__.py +37 -0
- harbor_rewardkit-0.1.0/src/rewardkit/__main__.py +90 -0
- harbor_rewardkit-0.1.0/src/rewardkit/compare.py +91 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/__init__.py +63 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/_command.py +30 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/_trajectory.py +31 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/command_output_contains.py +18 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/command_output_matches.py +18 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/command_output_matches_regex.py +19 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/command_succeeds.py +17 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/csv_cell_equals.py +33 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/diff_ratio.py +15 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/file_contains.py +13 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/file_contains_regex.py +14 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/file_exists.py +10 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/file_matches.py +15 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/file_not_exists.py +10 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/files_equal.py +15 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/http_response_contains.py +24 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/http_status_equals.py +26 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/image_similarity.py +42 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/image_size_equals.py +21 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/json_key_equals.py +17 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/json_path_equals.py +24 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/sqlite_query_equals.py +21 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/trajectory_tool_not_used.py +19 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/trajectory_tool_used.py +21 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/trajectory_turn_count.py +24 -0
- harbor_rewardkit-0.1.0/src/rewardkit/criteria/xlsx_cell_equals.py +28 -0
- harbor_rewardkit-0.1.0/src/rewardkit/isolation.py +126 -0
- harbor_rewardkit-0.1.0/src/rewardkit/judges.py +354 -0
- harbor_rewardkit-0.1.0/src/rewardkit/models.py +131 -0
- harbor_rewardkit-0.1.0/src/rewardkit/prompts/agent.md +3 -0
- harbor_rewardkit-0.1.0/src/rewardkit/prompts/llm.md +3 -0
- harbor_rewardkit-0.1.0/src/rewardkit/prompts/llm_trajectory.md +3 -0
- harbor_rewardkit-0.1.0/src/rewardkit/reward.py +208 -0
- harbor_rewardkit-0.1.0/src/rewardkit/runner.py +409 -0
- harbor_rewardkit-0.1.0/src/rewardkit/session.py +137 -0
- harbor_rewardkit-0.1.0/src/rewardkit/trajectory.py +132 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: harbor-rewardkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight grading toolkit for environment-based tasks.
|
|
5
|
+
Keywords: grading,evaluation,rewards,llm,agents,benchmarks
|
|
6
|
+
Author: benediktstroebl
|
|
7
|
+
License-Expression: Apache-2.0
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Topic :: Software Development :: Testing
|
|
15
|
+
Requires-Dist: litellm>=1.83
|
|
16
|
+
Requires-Dist: openpyxl>=3.1 ; extra == 'all'
|
|
17
|
+
Requires-Dist: pillow>=10.0 ; extra == 'all'
|
|
18
|
+
Requires-Dist: pillow>=10.0 ; extra == 'image'
|
|
19
|
+
Requires-Dist: openpyxl>=3.1 ; extra == 'office'
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Project-URL: Repository, https://github.com/harbor-framework/harbor
|
|
22
|
+
Project-URL: Issues, https://github.com/harbor-framework/harbor/issues
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Provides-Extra: image
|
|
25
|
+
Provides-Extra: office
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# rewardkit
|
|
29
|
+
|
|
30
|
+
[](https://discord.gg/6xWPKhGDbA)
|
|
31
|
+
[](https://harborframework.com/docs/rewardkit)
|
|
32
|
+
|
|
33
|
+
A lightweight grading toolkit for defining and running verifiers that output structured reward scores.
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv tool install harbor-rewardkit
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Example: Programmatic criteria
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
# tests/check.py
|
|
45
|
+
from rewardkit import criteria
|
|
46
|
+
|
|
47
|
+
criteria.file_exists("output.txt")
|
|
48
|
+
criteria.file_contains("output.txt", "hello")
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Example: LLM judge
|
|
52
|
+
|
|
53
|
+
```toml
|
|
54
|
+
# tests/quality.toml
|
|
55
|
+
[judge]
|
|
56
|
+
judge = "anthropic/claude-sonnet-4-6"
|
|
57
|
+
files = ["/app/main.py"]
|
|
58
|
+
|
|
59
|
+
[[criterion]]
|
|
60
|
+
description = "Is the code correct?"
|
|
61
|
+
type = "binary"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Usage
|
|
65
|
+
|
|
66
|
+
Add rewardkit to your `test.sh` file:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# tests/test.sh
|
|
70
|
+
uvx --from harbor-rewardkit rewardkit /tests
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# rewardkit
|
|
2
|
+
|
|
3
|
+
[](https://discord.gg/6xWPKhGDbA)
|
|
4
|
+
[](https://harborframework.com/docs/rewardkit)
|
|
5
|
+
|
|
6
|
+
A lightweight grading toolkit for defining and running verifiers that output structured reward scores.
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
uv tool install harbor-rewardkit
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Example: Programmatic criteria
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
# tests/check.py
|
|
18
|
+
from rewardkit import criteria
|
|
19
|
+
|
|
20
|
+
criteria.file_exists("output.txt")
|
|
21
|
+
criteria.file_contains("output.txt", "hello")
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Example: LLM judge
|
|
25
|
+
|
|
26
|
+
```toml
|
|
27
|
+
# tests/quality.toml
|
|
28
|
+
[judge]
|
|
29
|
+
judge = "anthropic/claude-sonnet-4-6"
|
|
30
|
+
files = ["/app/main.py"]
|
|
31
|
+
|
|
32
|
+
[[criterion]]
|
|
33
|
+
description = "Is the code correct?"
|
|
34
|
+
type = "binary"
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
|
|
39
|
+
Add rewardkit to your `test.sh` file:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# tests/test.sh
|
|
43
|
+
uvx --from harbor-rewardkit rewardkit /tests
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "harbor-rewardkit"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Lightweight grading toolkit for environment-based tasks."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "Apache-2.0"
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "benediktstroebl" },
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
keywords = ["grading", "evaluation", "rewards", "llm", "agents", "benchmarks"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 4 - Beta",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"License :: OSI Approved :: Apache Software License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Programming Language :: Python :: 3.13",
|
|
19
|
+
"Topic :: Software Development :: Testing",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"litellm>=1.83",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.optional-dependencies]
|
|
26
|
+
office = ["openpyxl>=3.1"]
|
|
27
|
+
image = ["Pillow>=10.0"]
|
|
28
|
+
all = ["openpyxl>=3.1", "Pillow>=10.0"]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Repository = "https://github.com/harbor-framework/harbor"
|
|
32
|
+
Issues = "https://github.com/harbor-framework/harbor/issues"
|
|
33
|
+
|
|
34
|
+
[project.scripts]
|
|
35
|
+
rewardkit = "rewardkit.__main__:main"
|
|
36
|
+
|
|
37
|
+
[build-system]
|
|
38
|
+
requires = ["uv_build>=0.10.8,<0.11.0"]
|
|
39
|
+
build-backend = "uv_build"
|
|
40
|
+
|
|
41
|
+
[tool.uv.build-backend]
|
|
42
|
+
module-name = "rewardkit"
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Public API for the ``rewardkit`` package.

Re-exports the comparison helpers, the data models, the reward runner
entry points, the ``criterion`` decorator, and the trajectory formatter
so users can import everything directly from ``rewardkit``.
"""

from rewardkit.compare import ComparisonResult, compare, format_comparison
from rewardkit.models import (
    Aggregation,
    AgentJudge,
    Binary,
    Criterion,
    LLMJudge,
    Likert,
    Numeric,
    OutputFormat,
    Score,
)
from rewardkit.reward import Reward
from rewardkit.runner import discover, run, run_multi
from rewardkit.session import criterion
from rewardkit.trajectory import format_trajectory

# Explicit public surface; kept alphabetical for easy scanning.
__all__ = [
    "Aggregation",
    "AgentJudge",
    "Binary",
    "ComparisonResult",
    "Criterion",
    "LLMJudge",
    "Likert",
    "Numeric",
    "OutputFormat",
    "Reward",
    "Score",
    "compare",
    "criterion",
    "discover",
    "format_comparison",
    "format_trajectory",
    "run",
    "run_multi",
]
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""CLI entry point: ``python -m rewardkit``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
|
|
7
|
+
from rewardkit.runner import run, run_multi
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main() -> None:
    """CLI entry point: parse arguments and run one or more reward suites.

    A single tests dir is scored and printed directly; several dirs are
    each scored independently and then summarized in a comparison table.
    """
    args = _build_parser().parse_args()

    limits = {
        "max_concurrent_programmatic": args.max_concurrent_programmatic,
        "max_concurrent_llm": args.max_concurrent_llm,
        "max_concurrent_agent": args.max_concurrent_agent,
    }

    if len(args.tests_dirs) == 1:
        scores = run(
            args.tests_dirs[0],
            workspace=args.workspace,
            output=args.output,
            **limits,
        )
        for name, score in scores.items():
            print(f"{name}: {score}")
        return

    # Multiple test suites: run independently, compare
    per_dir = run_multi(
        args.tests_dirs,
        workspace=args.workspace,
        output=args.output,
        **limits,
    )
    for label, suite_scores in per_dir.items():
        for name, score in suite_scores.items():
            print(f"{label}/{name}: {score}")

    from rewardkit.compare import format_comparison

    table = format_comparison(per_dir)
    if table:
        print()
        print(table)


def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for ``rewardkit``."""
    parser = argparse.ArgumentParser(
        prog="rewardkit",
        description="Discover and run folder-based rewards.",
    )
    parser.add_argument(
        "tests_dirs",
        nargs="+",
        help="Path(s) to tests directory. Multiple dirs run independently and are compared.",
    )
    parser.add_argument(
        "--workspace", default="/app", help="Workspace path (default: /app)"
    )
    parser.add_argument(
        "--output",
        default="/logs/verifier/reward.json",
        help="Output JSON path (default: /logs/verifier/reward.json)",
    )
    # Concurrency limits share a shape; the first (long) flag fixes the dest.
    concurrency_flags = [
        (
            ("--max-concurrent-programmatic", "--mcprog", "--mcp"),
            8,
            "Max programmatic rewards to run in parallel (0 = unlimited, default: 8)",
        ),
        (
            ("--max-concurrent-llm", "--mcllm", "--mcl"),
            8,
            "Max LLM judge calls to run in parallel (0 = unlimited, default: 8)",
        ),
        (
            ("--max-concurrent-agent", "--mcagent", "--mca"),
            2,
            "Max agent judge calls to run in parallel (0 = unlimited, default: 2)",
        ),
    ]
    for flags, default, help_text in concurrency_flags:
        parser.add_argument(*flags, type=int, default=default, help=help_text)
    return parser


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Comparison utilities for multi-dir reward results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ComparisonResult(BaseModel):
    """Comparison of reward scores across multiple test directories."""

    # Instances are immutable once constructed.
    model_config = ConfigDict(frozen=True)

    # Directory labels, in the order the results were supplied.
    labels: list[str]
    # reward_name -> {dir_label: score}; as built by compare(), only rewards
    # scored in at least two directories appear here.
    per_reward: dict[str, dict[str, float]] = Field(default_factory=dict)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def compare(
    results: dict[str, dict[str, float]],
) -> ComparisonResult:
    """Compare reward scores collected from multiple test directories.

    Args:
        results: Mapping of ``dir_label -> {reward_name: score}``.

    Returns:
        A :class:`ComparisonResult` keeping only reward names that were
        scored in at least two directories.
    """
    labels = list(results)
    # With fewer than two directories there is nothing to compare.
    if len(labels) < 2:
        return ComparisonResult(labels=labels)

    every_name: set[str] = set()
    for suite in results.values():
        every_name |= set(suite)

    per_reward: dict[str, dict[str, float]] = {}
    for name in sorted(every_name):
        entry = {
            label: results[label][name]
            for label in labels
            if results[label].get(name) is not None
        }
        if len(entry) >= 2:
            per_reward[name] = entry

    return ComparisonResult(labels=labels, per_reward=per_reward)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def format_comparison(results: dict[str, dict[str, float]]) -> str:
    """Format a comparison table for printing to stdout.

    Returns an empty string if there are fewer than 2 dirs or no overlapping
    reward names.
    """
    cr = compare(results)
    if not cr.per_reward:
        return ""

    labels = cr.labels
    # Column sizing: the name column fits the longest reward name; each label
    # column fits the label itself with a 6-char minimum (width of "x.xxxx").
    name_width = max(len("reward"), max(len(n) for n in cr.per_reward))
    col_widths = {label: max(len(label), 6) for label in labels}

    header = "reward".ljust(name_width)
    for label in labels:
        header += " " + label.rjust(col_widths[label])
    header += " " + "diff".rjust(6)

    sep = "-" * len(header)
    lines = ["Comparison:", sep, header, sep]

    for name, scores in cr.per_reward.items():
        row = name.ljust(name_width)
        values = []
        for label in labels:
            val = scores.get(label)
            if val is not None:
                row += " " + f"{val:.4f}".rjust(col_widths[label])
                values.append(val)
            else:
                # This dir did not score this reward: show a placeholder.
                row += " " + "-".rjust(col_widths[label])
        if len(values) >= 2:
            # diff is the first present value minus the last present value.
            diff = values[0] - values[-1]
            sign = "+" if diff > 0 else ""
            row += " " + f"{sign}{diff:.4f}".rjust(6)
        else:
            row += " " + "-".rjust(6)
        lines.append(row)

    lines.append(sep)
    return "\n".join(lines)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Criterion functions for rewardkit.
|
|
2
|
+
|
|
3
|
+
All criteria — built-in and user-defined — are resolved via the global
|
|
4
|
+
``_factory_registry`` in :mod:`rewardkit.session`. User-defined criteria
|
|
5
|
+
registered with ``@criterion`` override built-ins of the same name.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import importlib as _importlib
|
|
9
|
+
import sys as _sys
|
|
10
|
+
|
|
11
|
+
from rewardkit.session import _builtin_names, _factory_registry
|
|
12
|
+
|
|
13
|
+
# Import built-in criterion modules so their @criterion decorators execute
|
|
14
|
+
# and populate _factory_registry.
|
|
15
|
+
_BUILTIN_MODULES = [
|
|
16
|
+
"command_output_contains",
|
|
17
|
+
"command_output_matches",
|
|
18
|
+
"command_output_matches_regex",
|
|
19
|
+
"command_succeeds",
|
|
20
|
+
"csv_cell_equals",
|
|
21
|
+
"diff_ratio",
|
|
22
|
+
"file_contains",
|
|
23
|
+
"file_contains_regex",
|
|
24
|
+
"file_exists",
|
|
25
|
+
"file_matches",
|
|
26
|
+
"file_not_exists",
|
|
27
|
+
"files_equal",
|
|
28
|
+
"http_response_contains",
|
|
29
|
+
"http_status_equals",
|
|
30
|
+
"image_similarity",
|
|
31
|
+
"image_size_equals",
|
|
32
|
+
"json_key_equals",
|
|
33
|
+
"json_path_equals",
|
|
34
|
+
"sqlite_query_equals",
|
|
35
|
+
"trajectory_tool_not_used",
|
|
36
|
+
"trajectory_tool_used",
|
|
37
|
+
"trajectory_turn_count",
|
|
38
|
+
"xlsx_cell_equals",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
for _name in _BUILTIN_MODULES:
|
|
42
|
+
_importlib.import_module(f"rewardkit.criteria.{_name}")
|
|
43
|
+
|
|
44
|
+
# Mark currently registered names as built-in so user overrides produce a warning.
|
|
45
|
+
_builtin_names.update(_factory_registry)
|
|
46
|
+
|
|
47
|
+
# Python sets submodule attributes on the parent package.
|
|
48
|
+
# Remove them so all lookups go through __getattr__, which checks
|
|
49
|
+
# _factory_registry — this lets user-defined criteria override built-ins.
|
|
50
|
+
_this = _sys.modules[__name__]
|
|
51
|
+
for _name in _BUILTIN_MODULES:
|
|
52
|
+
delattr(_this, _name)
|
|
53
|
+
|
|
54
|
+
del _name, _this, _importlib, _sys
|
|
55
|
+
|
|
56
|
+
__all__ = list(_BUILTIN_MODULES)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def __getattr__(name: str):  # noqa: ANN204
    """Resolve criteria from the global factory registry."""
    try:
        return _factory_registry[name]
    except KeyError:
        raise AttributeError(
            f"module 'rewardkit.criteria' has no attribute {name!r}"
        ) from None
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Shared subprocess helper for command-based criteria."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import subprocess
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def run_command(
    workspace: Path,
    cmd: str,
    cwd: str | None = None,
    timeout: int = 30,
) -> subprocess.CompletedProcess[str] | None:
    """Run a shell command in the workspace directory.

    Args:
        workspace: Base directory for the command.
        cwd: Optional subdirectory (relative to ``workspace``) to run in.
        timeout: Seconds to wait before giving up.

    Returns:
        The CompletedProcess on completion, or None on timeout.
    """
    target_dir = workspace / cwd if cwd else workspace
    try:
        completed = subprocess.run(
            cmd,
            shell=True,
            cwd=str(target_dir),
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # The command ran past `timeout` seconds; signal failure with None.
        return None
    return completed
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Shared helpers for trajectory-based criteria."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def load_trajectory(path: str | Path) -> dict | None:
    """Load an ATIF trajectory JSON file.

    Args:
        path: Location of the trajectory JSON file.

    Returns:
        The parsed JSON value, or None when the file is missing, unreadable,
        not valid text, or not valid JSON.
    """
    try:
        return json.loads(Path(path).read_text())
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file (FileNotFoundError is
        # a subclass, so no separate exists() check is needed). ValueError
        # covers both json.JSONDecodeError and UnicodeDecodeError — the
        # latter was previously uncaught, so a binary file crashed instead
        # of returning None as documented.
        return None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def count_agent_turns(data: dict) -> int:
    """Count the number of steps with source == 'agent'."""
    total = 0
    for step in data.get("steps", []):
        if step.get("source") == "agent":
            total += 1
    return total
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def collect_tool_calls(data: dict) -> list[dict]:
    """Collect all tool calls across all steps.

    Steps whose ``tool_calls`` field is missing or None contribute nothing.
    """
    # `or []` normalizes both a missing key and an explicit None value.
    # Comprehension replaces the previous manual append loop (same order).
    return [
        call
        for step in data.get("steps", [])
        for call in step.get("tool_calls") or []
    ]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Criterion: check that a command's stdout contains a given string."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from rewardkit.criteria._command import run_command
|
|
6
|
+
from rewardkit.session import criterion
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@criterion(description="Check that the stdout of `{cmd}` contains '{text}'")
def command_output_contains(
    workspace: Path,
    cmd: str,
    text: str,
    cwd: str | None = None,
    timeout: int = 30,
) -> bool:
    """Return True when the command completes and its stdout contains *text*."""
    completed = run_command(workspace, cmd, cwd=cwd, timeout=timeout)
    if completed is None:
        return False
    return text in completed.stdout
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Criterion: check that a command's stdout exactly matches a string."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from rewardkit.criteria._command import run_command
|
|
6
|
+
from rewardkit.session import criterion
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@criterion(description="Check that output of `{cmd}` matches {expected!r}")
def command_output_matches(
    workspace: Path,
    cmd: str,
    expected: str,
    cwd: str | None = None,
    timeout: int = 30,
) -> bool:
    """Return True when the command's stripped stdout equals stripped *expected*."""
    completed = run_command(workspace, cmd, cwd=cwd, timeout=timeout)
    if completed is None:
        return False
    return completed.stdout.strip() == expected.strip()
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Criterion: check that a command's stdout matches a regex pattern."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from rewardkit.criteria._command import run_command
|
|
7
|
+
from rewardkit.session import criterion
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@criterion(description="Check that output of `{cmd}` matches regex '{pattern}'")
def command_output_matches_regex(
    workspace: Path,
    cmd: str,
    pattern: str,
    cwd: str | None = None,
    timeout: int = 30,
) -> bool:
    """Return True when the command completes and *pattern* matches its stdout."""
    completed = run_command(workspace, cmd, cwd=cwd, timeout=timeout)
    if completed is None:
        return False
    return bool(re.search(pattern, completed.stdout))
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Criterion: check that a shell command exits with code 0."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from rewardkit.criteria._command import run_command
|
|
6
|
+
from rewardkit.session import criterion
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@criterion(description="Check that the command `{cmd}` exits with code 0")
def command_succeeds(
    workspace: Path,
    cmd: str,
    cwd: str | None = None,
    timeout: int = 30,
) -> bool:
    """Return True when the command completes (no timeout) with exit code 0."""
    completed = run_command(workspace, cmd, cwd=cwd, timeout=timeout)
    if completed is None:
        return False
    return completed.returncode == 0
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Criterion: check that a specific CSV cell has the expected value."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from rewardkit.session import criterion
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@criterion(description="Check that {path}[{row},{col}] == {expected!r}")
def csv_cell_equals(
    workspace: Path,
    path: str,
    row: int,
    col: int | str,
    expected: str,
) -> bool:
    """Return True when cell (row, col) of the CSV at *path* equals *expected*.

    A string *col* is looked up as a header name via DictReader; an int
    *col* is a zero-based positional index. Any I/O or parse failure, a
    missing row, or an out-of-range column yields False.
    """
    try:
        with (workspace / path).open(newline="") as handle:
            if isinstance(col, str):
                for index, record in enumerate(csv.DictReader(handle)):
                    if index == row:
                        # Missing header names compare as the empty string.
                        return str(record.get(col, "")) == expected
                return False
            for index, record in enumerate(csv.reader(handle)):
                if index == row:
                    return col < len(record) and record[col] == expected
            return False
    except (FileNotFoundError, OSError, csv.Error, KeyError, IndexError):
        return False
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Criterion: fuzzy text comparison returning a similarity ratio [0, 1]."""
|
|
2
|
+
|
|
3
|
+
from difflib import SequenceMatcher
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from rewardkit.session import criterion
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@criterion(description="Similarity ratio for {path}")
def diff_ratio(workspace: Path, path: str, expected: str) -> float:
    """Return the SequenceMatcher similarity in [0, 1] between the file at
    *path* (stripped) and *expected* (stripped); 0.0 if the file can't be read.
    """
    try:
        actual = (workspace / path).read_text()
    except (FileNotFoundError, OSError):
        return 0.0
    matcher = SequenceMatcher(None, actual.strip(), expected.strip())
    return matcher.ratio()
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Criterion: check that a file contains a given string."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from rewardkit.session import criterion
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@criterion(description="Check that {path} contains the text '{text}'")
def file_contains(workspace: Path, path: str, text: str) -> bool:
    """Return True when the file at *path* is readable and contains *text*."""
    try:
        content = (workspace / path).read_text()
    except (FileNotFoundError, OSError):
        return False
    return text in content
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Criterion: check that a file contains text matching a regex pattern."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from rewardkit.session import criterion
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@criterion(description="Check that {path} contains text matching regex '{pattern}'")
def file_contains_regex(workspace: Path, path: str, pattern: str) -> bool:
    """Return True when the file at *path* is readable and *pattern* matches."""
    try:
        content = (workspace / path).read_text()
    except (FileNotFoundError, OSError):
        return False
    return bool(re.search(pattern, content))
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Criterion: check that a file exists."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from rewardkit.session import criterion
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@criterion(description="Check that {path} exists in the workspace")
def file_exists(workspace: Path, path: str) -> bool:
    """Return True when *path*, resolved inside the workspace, exists."""
    target = workspace / path
    return target.exists()
|