harbor-rewardkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. harbor_rewardkit-0.1.0/PKG-INFO +73 -0
  2. harbor_rewardkit-0.1.0/README.md +46 -0
  3. harbor_rewardkit-0.1.0/pyproject.toml +42 -0
  4. harbor_rewardkit-0.1.0/src/rewardkit/__init__.py +37 -0
  5. harbor_rewardkit-0.1.0/src/rewardkit/__main__.py +90 -0
  6. harbor_rewardkit-0.1.0/src/rewardkit/compare.py +91 -0
  7. harbor_rewardkit-0.1.0/src/rewardkit/criteria/__init__.py +63 -0
  8. harbor_rewardkit-0.1.0/src/rewardkit/criteria/_command.py +30 -0
  9. harbor_rewardkit-0.1.0/src/rewardkit/criteria/_trajectory.py +31 -0
  10. harbor_rewardkit-0.1.0/src/rewardkit/criteria/command_output_contains.py +18 -0
  11. harbor_rewardkit-0.1.0/src/rewardkit/criteria/command_output_matches.py +18 -0
  12. harbor_rewardkit-0.1.0/src/rewardkit/criteria/command_output_matches_regex.py +19 -0
  13. harbor_rewardkit-0.1.0/src/rewardkit/criteria/command_succeeds.py +17 -0
  14. harbor_rewardkit-0.1.0/src/rewardkit/criteria/csv_cell_equals.py +33 -0
  15. harbor_rewardkit-0.1.0/src/rewardkit/criteria/diff_ratio.py +15 -0
  16. harbor_rewardkit-0.1.0/src/rewardkit/criteria/file_contains.py +13 -0
  17. harbor_rewardkit-0.1.0/src/rewardkit/criteria/file_contains_regex.py +14 -0
  18. harbor_rewardkit-0.1.0/src/rewardkit/criteria/file_exists.py +10 -0
  19. harbor_rewardkit-0.1.0/src/rewardkit/criteria/file_matches.py +15 -0
  20. harbor_rewardkit-0.1.0/src/rewardkit/criteria/file_not_exists.py +10 -0
  21. harbor_rewardkit-0.1.0/src/rewardkit/criteria/files_equal.py +15 -0
  22. harbor_rewardkit-0.1.0/src/rewardkit/criteria/http_response_contains.py +24 -0
  23. harbor_rewardkit-0.1.0/src/rewardkit/criteria/http_status_equals.py +26 -0
  24. harbor_rewardkit-0.1.0/src/rewardkit/criteria/image_similarity.py +42 -0
  25. harbor_rewardkit-0.1.0/src/rewardkit/criteria/image_size_equals.py +21 -0
  26. harbor_rewardkit-0.1.0/src/rewardkit/criteria/json_key_equals.py +17 -0
  27. harbor_rewardkit-0.1.0/src/rewardkit/criteria/json_path_equals.py +24 -0
  28. harbor_rewardkit-0.1.0/src/rewardkit/criteria/sqlite_query_equals.py +21 -0
  29. harbor_rewardkit-0.1.0/src/rewardkit/criteria/trajectory_tool_not_used.py +19 -0
  30. harbor_rewardkit-0.1.0/src/rewardkit/criteria/trajectory_tool_used.py +21 -0
  31. harbor_rewardkit-0.1.0/src/rewardkit/criteria/trajectory_turn_count.py +24 -0
  32. harbor_rewardkit-0.1.0/src/rewardkit/criteria/xlsx_cell_equals.py +28 -0
  33. harbor_rewardkit-0.1.0/src/rewardkit/isolation.py +126 -0
  34. harbor_rewardkit-0.1.0/src/rewardkit/judges.py +354 -0
  35. harbor_rewardkit-0.1.0/src/rewardkit/models.py +131 -0
  36. harbor_rewardkit-0.1.0/src/rewardkit/prompts/agent.md +3 -0
  37. harbor_rewardkit-0.1.0/src/rewardkit/prompts/llm.md +3 -0
  38. harbor_rewardkit-0.1.0/src/rewardkit/prompts/llm_trajectory.md +3 -0
  39. harbor_rewardkit-0.1.0/src/rewardkit/reward.py +208 -0
  40. harbor_rewardkit-0.1.0/src/rewardkit/runner.py +409 -0
  41. harbor_rewardkit-0.1.0/src/rewardkit/session.py +137 -0
  42. harbor_rewardkit-0.1.0/src/rewardkit/trajectory.py +132 -0
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.4
2
+ Name: harbor-rewardkit
3
+ Version: 0.1.0
4
+ Summary: Lightweight grading toolkit for environment-based tasks.
5
+ Keywords: grading,evaluation,rewards,llm,agents,benchmarks
6
+ Author: benediktstroebl
7
+ License-Expression: Apache-2.0
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Topic :: Software Development :: Testing
15
+ Requires-Dist: litellm>=1.83
16
+ Requires-Dist: openpyxl>=3.1 ; extra == 'all'
17
+ Requires-Dist: pillow>=10.0 ; extra == 'all'
18
+ Requires-Dist: pillow>=10.0 ; extra == 'image'
19
+ Requires-Dist: openpyxl>=3.1 ; extra == 'office'
20
+ Requires-Python: >=3.12
21
+ Project-URL: Repository, https://github.com/harbor-framework/harbor
22
+ Project-URL: Issues, https://github.com/harbor-framework/harbor/issues
23
+ Provides-Extra: all
24
+ Provides-Extra: image
25
+ Provides-Extra: office
26
+ Description-Content-Type: text/markdown
27
+
28
+ # rewardkit
29
+
30
+ [![](https://dcbadge.limes.pink/api/server/https://discord.gg/6xWPKhGDbA)](https://discord.gg/6xWPKhGDbA)
31
+ [![Docs](https://img.shields.io/badge/Docs-000000?style=for-the-badge&logo=mdbook&color=105864)](https://harborframework.com/docs/rewardkit)
32
+
33
+ A lightweight grading toolkit for defining and running verifiers that output structured reward scores.
34
+
35
+ ## Installation
36
+
37
+ ```bash
38
+ uv tool install harbor-rewardkit
39
+ ```
40
+
41
+ ## Example: Programmatic criteria
42
+
43
+ ```python
44
+ # tests/check.py
45
+ from rewardkit import criteria
46
+
47
+ criteria.file_exists("output.txt")
48
+ criteria.file_contains("output.txt", "hello")
49
+ ```
50
+
51
+ ## Example: LLM judge
52
+
53
+ ```toml
54
+ # tests/quality.toml
55
+ [judge]
56
+ judge = "anthropic/claude-sonnet-4-6"
57
+ files = ["/app/main.py"]
58
+
59
+ [[criterion]]
60
+ description = "Is the code correct?"
61
+ type = "binary"
62
+ ```
63
+
64
+ ## Usage
65
+
66
+ Add rewardkit to your `test.sh` file:
67
+
68
+ ```bash
69
+ # tests/test.sh
70
+ uvx --from harbor-rewardkit rewardkit /tests
71
+ ```
72
+
73
+ See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).
@@ -0,0 +1,46 @@
1
+ # rewardkit
2
+
3
+ [![](https://dcbadge.limes.pink/api/server/https://discord.gg/6xWPKhGDbA)](https://discord.gg/6xWPKhGDbA)
4
+ [![Docs](https://img.shields.io/badge/Docs-000000?style=for-the-badge&logo=mdbook&color=105864)](https://harborframework.com/docs/rewardkit)
5
+
6
+ A lightweight grading toolkit for defining and running verifiers that output structured reward scores.
7
+
8
+ ## Installation
9
+
10
+ ```bash
11
+ uv tool install harbor-rewardkit
12
+ ```
13
+
14
+ ## Example: Programmatic criteria
15
+
16
+ ```python
17
+ # tests/check.py
18
+ from rewardkit import criteria
19
+
20
+ criteria.file_exists("output.txt")
21
+ criteria.file_contains("output.txt", "hello")
22
+ ```
23
+
24
+ ## Example: LLM judge
25
+
26
+ ```toml
27
+ # tests/quality.toml
28
+ [judge]
29
+ judge = "anthropic/claude-sonnet-4-6"
30
+ files = ["/app/main.py"]
31
+
32
+ [[criterion]]
33
+ description = "Is the code correct?"
34
+ type = "binary"
35
+ ```
36
+
37
+ ## Usage
38
+
39
+ Add rewardkit to your `test.sh` file:
40
+
41
+ ```bash
42
+ # tests/test.sh
43
+ uvx --from harbor-rewardkit rewardkit /tests
44
+ ```
45
+
46
+ See the [documentation](https://harborframework.com/docs/rewardkit) and a full [working example](https://github.com/harbor-framework/harbor/tree/main/examples/tasks/reward-kit-example).
@@ -0,0 +1,42 @@
1
+ [project]
2
+ name = "harbor-rewardkit"
3
+ version = "0.1.0"
4
+ description = "Lightweight grading toolkit for environment-based tasks."
5
+ readme = "README.md"
6
+ license = "Apache-2.0"
7
+ authors = [
8
+ { name = "benediktstroebl" },
9
+ ]
10
+ requires-python = ">=3.12"
11
+ keywords = ["grading", "evaluation", "rewards", "llm", "agents", "benchmarks"]
12
+ classifiers = [
13
+ "Development Status :: 4 - Beta",
14
+ "Intended Audience :: Developers",
15
+ "License :: OSI Approved :: Apache Software License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.12",
18
+ "Programming Language :: Python :: 3.13",
19
+ "Topic :: Software Development :: Testing",
20
+ ]
21
+ dependencies = [
22
+ "litellm>=1.83",
23
+ ]
24
+
25
+ [project.optional-dependencies]
26
+ office = ["openpyxl>=3.1"]
27
+ image = ["Pillow>=10.0"]
28
+ all = ["openpyxl>=3.1", "Pillow>=10.0"]
29
+
30
+ [project.urls]
31
+ Repository = "https://github.com/harbor-framework/harbor"
32
+ Issues = "https://github.com/harbor-framework/harbor/issues"
33
+
34
+ [project.scripts]
35
+ rewardkit = "rewardkit.__main__:main"
36
+
37
+ [build-system]
38
+ requires = ["uv_build>=0.10.8,<0.11.0"]
39
+ build-backend = "uv_build"
40
+
41
+ [tool.uv.build-backend]
42
+ module-name = "rewardkit"
@@ -0,0 +1,37 @@
1
"""Public API for the rewardkit package.

Re-exports the comparison helpers, score/judge models, reward and runner
entry points, the ``@criterion`` decorator, and trajectory formatting so
callers can use ``import rewardkit`` directly.
"""

from rewardkit.compare import ComparisonResult, compare, format_comparison
from rewardkit.models import (
    Aggregation,
    AgentJudge,
    Binary,
    Criterion,
    LLMJudge,
    Likert,
    Numeric,
    OutputFormat,
    Score,
)
from rewardkit.reward import Reward
from rewardkit.runner import discover, run, run_multi
from rewardkit.session import criterion
from rewardkit.trajectory import format_trajectory

# Explicit public API: classes first (alphabetical), then functions.
__all__ = [
    "Aggregation",
    "AgentJudge",
    "Binary",
    "ComparisonResult",
    "Criterion",
    "LLMJudge",
    "Likert",
    "Numeric",
    "OutputFormat",
    "Reward",
    "Score",
    "compare",
    "criterion",
    "discover",
    "format_comparison",
    "format_trajectory",
    "run",
    "run_multi",
]
@@ -0,0 +1,90 @@
1
+ """CLI entry point: ``python -m rewardkit``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+
7
+ from rewardkit.runner import run, run_multi
8
+
9
+
10
def main() -> None:
    """Parse CLI arguments, run the reward suite(s), and print scores.

    A single tests dir is graded on its own; multiple dirs are graded
    independently and a comparison table is printed afterwards.
    """
    parser = argparse.ArgumentParser(
        prog="rewardkit",
        description="Discover and run folder-based rewards.",
    )
    parser.add_argument(
        "tests_dirs",
        nargs="+",
        help="Path(s) to tests directory. Multiple dirs run independently and are compared.",
    )
    parser.add_argument(
        "--workspace", default="/app", help="Workspace path (default: /app)"
    )
    parser.add_argument(
        "--output",
        default="/logs/verifier/reward.json",
        help="Output JSON path (default: /logs/verifier/reward.json)",
    )
    # The three concurrency limits share one shape; declare them in a single pass.
    concurrency_flags = [
        (
            ("--max-concurrent-programmatic", "--mcprog", "--mcp"),
            8,
            "Max programmatic rewards to run in parallel (0 = unlimited, default: 8)",
        ),
        (
            ("--max-concurrent-llm", "--mcllm", "--mcl"),
            8,
            "Max LLM judge calls to run in parallel (0 = unlimited, default: 8)",
        ),
        (
            ("--max-concurrent-agent", "--mcagent", "--mca"),
            2,
            "Max agent judge calls to run in parallel (0 = unlimited, default: 2)",
        ),
    ]
    for flags, default, help_text in concurrency_flags:
        parser.add_argument(*flags, type=int, default=default, help=help_text)

    args = parser.parse_args()
    limits = {
        "max_concurrent_programmatic": args.max_concurrent_programmatic,
        "max_concurrent_llm": args.max_concurrent_llm,
        "max_concurrent_agent": args.max_concurrent_agent,
    }

    if len(args.tests_dirs) > 1:
        # Multiple test suites: run independently, compare
        per_dir = run_multi(
            args.tests_dirs,
            workspace=args.workspace,
            output=args.output,
            **limits,
        )
        for label, scores in per_dir.items():
            for name, score in scores.items():
                print(f"{label}/{name}: {score}")

        # Deferred import: compare is only needed on the multi-dir path.
        from rewardkit.compare import format_comparison

        table = format_comparison(per_dir)
        if table:
            print()
            print(table)
    else:
        scores = run(
            args.tests_dirs[0],
            workspace=args.workspace,
            output=args.output,
            **limits,
        )
        for name, score in scores.items():
            print(f"{name}: {score}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,91 @@
1
+ """Comparison utilities for multi-dir reward results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+
8
class ComparisonResult(BaseModel):
    """Comparison of reward scores across multiple test directories."""

    # Frozen: instances are immutable value objects once constructed.
    model_config = ConfigDict(frozen=True)

    # Directory labels, in the order the caller supplied them.
    labels: list[str]
    # reward_name -> {dir_label -> score}; populated only for rewards that
    # were scored in at least two directories.
    per_reward: dict[str, dict[str, float]] = Field(default_factory=dict)
15
+
16
+
17
def compare(
    results: dict[str, dict[str, float]],
) -> ComparisonResult:
    """Compare results from multiple test directories.

    Args:
        results: Mapping of ``dir_label -> {reward_name: score}``.

    Returns:
        A :class:`ComparisonResult` whose ``per_reward`` maps each reward
        name scored by at least two directories to its per-label scores.
    """
    labels = list(results)
    # With fewer than two directories there is nothing to compare.
    if len(labels) < 2:
        return ComparisonResult(labels=labels)

    every_name = {name for scores in results.values() for name in scores}

    per_reward: dict[str, dict[str, float]] = {}
    for name in sorted(every_name):
        entry = {
            label: results[label][name]
            for label in labels
            if results[label].get(name) is not None
        }
        # Only rewards present in two or more directories are comparable.
        if len(entry) >= 2:
            per_reward[name] = entry

    return ComparisonResult(labels=labels, per_reward=per_reward)
48
+
49
+
50
def format_comparison(results: dict[str, dict[str, float]]) -> str:
    """Format a comparison table for printing to stdout.

    Args:
        results: Mapping of ``dir_label -> {reward_name: score}``.

    Returns:
        A fixed-width table of per-reward scores across directories plus a
        first-vs-last diff column, or an empty string if there are fewer
        than 2 dirs or no overlapping reward names.
    """
    cr = compare(results)
    if not cr.per_reward:
        return ""

    labels = cr.labels
    name_width = max(len("reward"), max(len(n) for n in cr.per_reward))
    col_widths = {label: max(len(label), 6) for label in labels}
    # Signed diffs such as "+0.1234" are 7 characters; a 6-wide column would
    # overflow and shift the row out of alignment with the header/separator.
    diff_width = 7

    header = "reward".ljust(name_width)
    for label in labels:
        header += " " + label.rjust(col_widths[label])
    header += " " + "diff".rjust(diff_width)

    sep = "-" * len(header)
    lines = ["Comparison:", sep, header, sep]

    for name, scores in cr.per_reward.items():
        row = name.ljust(name_width)
        values = []
        for label in labels:
            val = scores.get(label)
            if val is not None:
                row += " " + f"{val:.4f}".rjust(col_widths[label])
                values.append(val)
            else:
                row += " " + "-".rjust(col_widths[label])
        if len(values) >= 2:
            # Diff compares the first and last labels only; middle columns
            # (when >2 dirs) are shown but not diffed.
            diff = values[0] - values[-1]
            sign = "+" if diff > 0 else ""
            row += " " + f"{sign}{diff:.4f}".rjust(diff_width)
        else:
            row += " " + "-".rjust(diff_width)
        lines.append(row)

    lines.append(sep)
    return "\n".join(lines)
@@ -0,0 +1,63 @@
1
+ """Criterion functions for rewardkit.
2
+
3
+ All criteria — built-in and user-defined — are resolved via the global
4
+ ``_factory_registry`` in :mod:`rewardkit.session`. User-defined criteria
5
+ registered with ``@criterion`` override built-ins of the same name.
6
+ """
7
+
8
+ import importlib as _importlib
9
+ import sys as _sys
10
+
11
+ from rewardkit.session import _builtin_names, _factory_registry
12
+
13
+ # Import built-in criterion modules so their @criterion decorators execute
14
+ # and populate _factory_registry.
15
+ _BUILTIN_MODULES = [
16
+ "command_output_contains",
17
+ "command_output_matches",
18
+ "command_output_matches_regex",
19
+ "command_succeeds",
20
+ "csv_cell_equals",
21
+ "diff_ratio",
22
+ "file_contains",
23
+ "file_contains_regex",
24
+ "file_exists",
25
+ "file_matches",
26
+ "file_not_exists",
27
+ "files_equal",
28
+ "http_response_contains",
29
+ "http_status_equals",
30
+ "image_similarity",
31
+ "image_size_equals",
32
+ "json_key_equals",
33
+ "json_path_equals",
34
+ "sqlite_query_equals",
35
+ "trajectory_tool_not_used",
36
+ "trajectory_tool_used",
37
+ "trajectory_turn_count",
38
+ "xlsx_cell_equals",
39
+ ]
40
+
41
+ for _name in _BUILTIN_MODULES:
42
+ _importlib.import_module(f"rewardkit.criteria.{_name}")
43
+
44
+ # Mark currently registered names as built-in so user overrides produce a warning.
45
+ _builtin_names.update(_factory_registry)
46
+
47
+ # Python sets submodule attributes on the parent package.
48
+ # Remove them so all lookups go through __getattr__, which checks
49
+ # _factory_registry — this lets user-defined criteria override built-ins.
50
+ _this = _sys.modules[__name__]
51
+ for _name in _BUILTIN_MODULES:
52
+ delattr(_this, _name)
53
+
54
+ del _name, _this, _importlib, _sys
55
+
56
+ __all__ = list(_BUILTIN_MODULES)
57
+
58
+
59
def __getattr__(name: str):  # noqa: ANN204
    """Resolve criteria from the global factory registry."""
    try:
        return _factory_registry[name]
    except KeyError:
        raise AttributeError(
            f"module 'rewardkit.criteria' has no attribute {name!r}"
        ) from None
@@ -0,0 +1,30 @@
1
+ """Shared subprocess helper for command-based criteria."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+
9
+ def run_command(
10
+ workspace: Path,
11
+ cmd: str,
12
+ cwd: str | None = None,
13
+ timeout: int = 30,
14
+ ) -> subprocess.CompletedProcess[str] | None:
15
+ """Run a shell command in the workspace directory.
16
+
17
+ Returns the CompletedProcess on success, or None on timeout.
18
+ """
19
+ run_cwd = str(workspace / cwd) if cwd else str(workspace)
20
+ try:
21
+ return subprocess.run(
22
+ cmd,
23
+ shell=True,
24
+ cwd=run_cwd,
25
+ capture_output=True,
26
+ text=True,
27
+ timeout=timeout,
28
+ )
29
+ except subprocess.TimeoutExpired:
30
+ return None
@@ -0,0 +1,31 @@
1
+ """Shared helpers for trajectory-based criteria."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+
9
+ def load_trajectory(path: str | Path) -> dict | None:
10
+ """Load an ATIF trajectory JSON file. Returns None on error."""
11
+ p = Path(path)
12
+ if not p.exists():
13
+ return None
14
+ try:
15
+ return json.loads(p.read_text())
16
+ except (json.JSONDecodeError, OSError):
17
+ return None
18
+
19
+
20
def count_agent_turns(data: dict) -> int:
    """Count the number of steps with source == 'agent'."""
    agent_steps = [s for s in data.get("steps", []) if s.get("source") == "agent"]
    return len(agent_steps)
23
+
24
+
25
def collect_tool_calls(data: dict) -> list[dict]:
    """Flatten the tool calls of every step into a single list."""
    # `or []` also covers steps whose "tool_calls" key is present but None.
    return [
        call
        for step in data.get("steps", [])
        for call in step.get("tool_calls") or []
    ]
@@ -0,0 +1,18 @@
1
+ """Criterion: check that a command's stdout contains a given string."""
2
+
3
+ from pathlib import Path
4
+
5
+ from rewardkit.criteria._command import run_command
6
+ from rewardkit.session import criterion
7
+
8
+
9
@criterion(description="Check that the stdout of `{cmd}` contains '{text}'")
def command_output_contains(
    workspace: Path,
    cmd: str,
    text: str,
    cwd: str | None = None,
    timeout: int = 30,
) -> bool:
    """True when the command finishes before the timeout and its stdout
    contains *text* as a substring."""
    proc = run_command(workspace, cmd, cwd=cwd, timeout=timeout)
    if proc is None:
        return False
    return text in proc.stdout
@@ -0,0 +1,18 @@
1
+ """Criterion: check that a command's stdout exactly matches a string."""
2
+
3
+ from pathlib import Path
4
+
5
+ from rewardkit.criteria._command import run_command
6
+ from rewardkit.session import criterion
7
+
8
+
9
@criterion(description="Check that output of `{cmd}` matches {expected!r}")
def command_output_matches(
    workspace: Path,
    cmd: str,
    expected: str,
    cwd: str | None = None,
    timeout: int = 30,
) -> bool:
    """True when the command's stdout equals *expected*, comparing both
    sides with surrounding whitespace stripped."""
    proc = run_command(workspace, cmd, cwd=cwd, timeout=timeout)
    if proc is None:
        return False
    return proc.stdout.strip() == expected.strip()
@@ -0,0 +1,19 @@
1
+ """Criterion: check that a command's stdout matches a regex pattern."""
2
+
3
+ import re
4
+ from pathlib import Path
5
+
6
+ from rewardkit.criteria._command import run_command
7
+ from rewardkit.session import criterion
8
+
9
+
10
@criterion(description="Check that output of `{cmd}` matches regex '{pattern}'")
def command_output_matches_regex(
    workspace: Path,
    cmd: str,
    pattern: str,
    cwd: str | None = None,
    timeout: int = 30,
) -> bool:
    """True when *pattern* (``re.search`` semantics) occurs anywhere in the
    command's stdout and the command did not time out."""
    proc = run_command(workspace, cmd, cwd=cwd, timeout=timeout)
    if proc is None:
        return False
    return bool(re.search(pattern, proc.stdout))
@@ -0,0 +1,17 @@
1
+ """Criterion: check that a shell command exits with code 0."""
2
+
3
+ from pathlib import Path
4
+
5
+ from rewardkit.criteria._command import run_command
6
+ from rewardkit.session import criterion
7
+
8
+
9
@criterion(description="Check that the command `{cmd}` exits with code 0")
def command_succeeds(
    workspace: Path,
    cmd: str,
    cwd: str | None = None,
    timeout: int = 30,
) -> bool:
    """True when the command finishes before the timeout with exit status 0."""
    proc = run_command(workspace, cmd, cwd=cwd, timeout=timeout)
    if proc is None:
        return False
    return proc.returncode == 0
@@ -0,0 +1,33 @@
1
+ """Criterion: check that a specific CSV cell has the expected value."""
2
+
3
+ import csv
4
+ from pathlib import Path
5
+
6
+ from rewardkit.session import criterion
7
+
8
+
9
@criterion(description="Check that {path}[{row},{col}] == {expected!r}")
def csv_cell_equals(
    workspace: Path,
    path: str,
    row: int,
    col: int | str,
    expected: str,
) -> bool:
    """Compare one CSV cell against *expected*.

    A string *col* selects a column by header name (DictReader); an int
    *col* selects by position. *row* counts data rows from 0 — the header
    row is not counted when *col* is a header name. Any I/O or parse
    failure yields False.
    """
    try:
        with (workspace / path).open(newline="") as handle:
            if isinstance(col, str):
                for index, record in enumerate(csv.DictReader(handle)):
                    if index != row:
                        continue
                    return str(record.get(col, "")) == expected
                return False
            for index, record in enumerate(csv.reader(handle)):
                if index != row:
                    continue
                # Out-of-range positional column is simply a mismatch.
                return record[col] == expected if col < len(record) else False
            return False
    except (FileNotFoundError, OSError, csv.Error, KeyError, IndexError):
        return False
@@ -0,0 +1,15 @@
1
+ """Criterion: fuzzy text comparison returning a similarity ratio [0, 1]."""
2
+
3
+ from difflib import SequenceMatcher
4
+ from pathlib import Path
5
+
6
+ from rewardkit.session import criterion
7
+
8
+
9
@criterion(description="Similarity ratio for {path}")
def diff_ratio(workspace: Path, path: str, expected: str) -> float:
    """SequenceMatcher similarity in [0, 1] between the file's text and
    *expected*, both stripped of surrounding whitespace.

    A missing or unreadable file scores 0.0.
    """
    try:
        actual = (workspace / path).read_text()
    except (FileNotFoundError, OSError):
        return 0.0
    matcher = SequenceMatcher(None, actual.strip(), expected.strip())
    return matcher.ratio()
@@ -0,0 +1,13 @@
1
+ """Criterion: check that a file contains a given string."""
2
+
3
+ from pathlib import Path
4
+
5
+ from rewardkit.session import criterion
6
+
7
+
8
@criterion(description="Check that {path} contains the text '{text}'")
def file_contains(workspace: Path, path: str, text: str) -> bool:
    """True when the file exists, is readable, and includes *text*."""
    try:
        contents = (workspace / path).read_text()
    except (FileNotFoundError, OSError):
        return False
    return text in contents
@@ -0,0 +1,14 @@
1
+ """Criterion: check that a file contains text matching a regex pattern."""
2
+
3
+ import re
4
+ from pathlib import Path
5
+
6
+ from rewardkit.session import criterion
7
+
8
+
9
@criterion(description="Check that {path} contains text matching regex '{pattern}'")
def file_contains_regex(workspace: Path, path: str, pattern: str) -> bool:
    """True when ``re.search`` finds *pattern* anywhere in the file's text."""
    try:
        contents = (workspace / path).read_text()
    except (FileNotFoundError, OSError):
        return False
    return bool(re.search(pattern, contents))
@@ -0,0 +1,10 @@
1
+ """Criterion: check that a file exists."""
2
+
3
+ from pathlib import Path
4
+
5
+ from rewardkit.session import criterion
6
+
7
+
8
@criterion(description="Check that {path} exists in the workspace")
def file_exists(workspace: Path, path: str) -> bool:
    """True when *path*, resolved relative to the workspace, exists
    (as a file, directory, or any other filesystem entry)."""
    target = workspace / path
    return target.exists()