harborforge 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- harborforge/__init__.py +6 -0
- harborforge/handlers/__init__.py +19 -0
- harborforge/handlers/base.py +231 -0
- harborforge/mapper.py +142 -0
- harborforge/registry.py +61 -0
- harborforge/run.py +117 -0
- harborforge-1.0.0.dist-info/METADATA +8 -0
- harborforge-1.0.0.dist-info/RECORD +9 -0
- harborforge-1.0.0.dist-info/WHEEL +4 -0
harborforge/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from .base import (
|
|
2
|
+
DatasetHandler,
|
|
3
|
+
case_insensitive_verifier,
|
|
4
|
+
exact_match_verifier,
|
|
5
|
+
list_verifier,
|
|
6
|
+
no_verifier,
|
|
7
|
+
numeric_verifier,
|
|
8
|
+
script_verifier,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"DatasetHandler",
|
|
13
|
+
"case_insensitive_verifier",
|
|
14
|
+
"exact_match_verifier",
|
|
15
|
+
"list_verifier",
|
|
16
|
+
"no_verifier",
|
|
17
|
+
"numeric_verifier",
|
|
18
|
+
"script_verifier",
|
|
19
|
+
]
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DatasetHandler contract.
|
|
3
|
+
|
|
4
|
+
Each dataset subclass must implement:
|
|
5
|
+
- instruction(task_data) → str content for instruction.md (shown to the agent, no answer leakage)
|
|
6
|
+
- test_sh(task_data) → str content for tests/test.sh (must write reward to /logs/verifier/reward.txt)
|
|
7
|
+
- dockerfile(task_data) → str content for environment/Dockerfile
|
|
8
|
+
|
|
9
|
+
Optional overrides:
|
|
10
|
+
- data_files(task_data) → list[tuple[Path, str]] (local_path, dest_in_build_context) pairs
|
|
11
|
+
- task_toml(task_id) → str content for task.toml
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from abc import ABC, abstractmethod
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DatasetHandler(ABC):
|
|
20
|
+
dataset_name: str
|
|
21
|
+
|
|
22
|
+
@abstractmethod
|
|
23
|
+
def instruction(self, task_data: dict[str, Any]) -> str:
|
|
24
|
+
"""Returns instruction.md content — what the agent sees. Must not include the answer."""
|
|
25
|
+
|
|
26
|
+
@abstractmethod
|
|
27
|
+
def test_sh(self, task_data: dict[str, Any]) -> str:
|
|
28
|
+
"""Returns tests/test.sh content. Must write a float reward to /logs/verifier/reward.txt."""
|
|
29
|
+
|
|
30
|
+
@abstractmethod
|
|
31
|
+
def dockerfile(self, task_data: dict[str, Any]) -> str:
|
|
32
|
+
"""Returns environment/Dockerfile content."""
|
|
33
|
+
|
|
34
|
+
def verifier_dockerfile(self, task_data: dict[str, Any]) -> str | None:
|
|
35
|
+
"""Return a Dockerfile for a SEPARATE verifier container, or None for SHARED (default).
|
|
36
|
+
|
|
37
|
+
When non-None, Harbor routes the trial through SEPARATE mode: the verifier runs in
|
|
38
|
+
its own isolated container and receives agent artifacts as input. Use this when the
|
|
39
|
+
verifier needs different dependencies or ground truth data isolated from the agent.
|
|
40
|
+
"""
|
|
41
|
+
return None
|
|
42
|
+
|
|
43
|
+
def setup(self) -> None:
|
|
44
|
+
"""Download or prepare local data required by this dataset. No-op by default."""
|
|
45
|
+
|
|
46
|
+
def artifacts(self) -> list[str]:
|
|
47
|
+
"""Container paths to capture as artifacts after a trial. Empty by default."""
|
|
48
|
+
return []
|
|
49
|
+
|
|
50
|
+
def verifier_env_keys(self) -> list[str]:
|
|
51
|
+
"""Environment variable keys to forward to the SEPARATE verifier container."""
|
|
52
|
+
return []
|
|
53
|
+
|
|
54
|
+
def data_files(self, task_data: dict[str, Any]) -> list[tuple[Path, str]]:
|
|
55
|
+
"""
|
|
56
|
+
Returns (local_path, dest_in_build_context) pairs for files to COPY into the image.
|
|
57
|
+
dest_in_build_context is relative to environment/ (e.g. 'data/train.csv').
|
|
58
|
+
"""
|
|
59
|
+
return []
|
|
60
|
+
|
|
61
|
+
def task_toml(self, task_id: str) -> str:
|
|
62
|
+
return f"""\
|
|
63
|
+
version = "1.0"
|
|
64
|
+
|
|
65
|
+
[task]
|
|
66
|
+
name = "{task_id}"
|
|
67
|
+
|
|
68
|
+
[metadata]
|
|
69
|
+
|
|
70
|
+
[verifier]
|
|
71
|
+
timeout_sec = 900.0
|
|
72
|
+
|
|
73
|
+
[agent]
|
|
74
|
+
timeout_sec = 900.0
|
|
75
|
+
|
|
76
|
+
[environment]
|
|
77
|
+
build_timeout_sec = 600.0
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Verifier shell script templates
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def exact_match_verifier(answer: Any) -> str:
    """Return a test.sh that rewards 1 when the submission equals the expected answer.

    Both sides are compared as strings after ALL whitespace (not just leading
    and trailing) has been removed. A missing /app/submission.txt yields
    reward 0.

    Args:
        answer: expected answer; converted with str() and embedded as a
            single-quoted shell literal.
    """
    # Close the quote, emit an escaped quote, reopen: the standard way to put
    # a single quote inside a single-quoted shell string.
    quoted = str(answer).replace("'", "'\\''")
    # printf '%s' is used instead of echo so answers such as "-n" or "-e" are
    # not swallowed as echo options.
    return f"""\
#!/bin/bash
set -e
mkdir -p /logs/verifier

EXPECTED='{quoted}'
SUBMISSION=/app/submission.txt

if [ ! -f "$SUBMISSION" ]; then
    echo "0" > /logs/verifier/reward.txt
    exit 0
fi

ACTUAL=$(cat "$SUBMISSION" | tr -d '[:space:]')
EXPECTED_CLEAN=$(printf '%s' "$EXPECTED" | tr -d '[:space:]')

if [ "$ACTUAL" = "$EXPECTED_CLEAN" ]; then
    echo "1" > /logs/verifier/reward.txt
else
    echo "0" > /logs/verifier/reward.txt
fi
"""
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def numeric_verifier(answer: Any, tolerance: float = 0.05) -> str:
    """Build a test.sh that scores a numeric submission against `answer`.

    The generated script parses both values as floats and writes 1.0 when the
    relative error is at most `tolerance` (the absolute error is used when the
    expected value is 0); any parsing or I/O failure writes 0.0.
    """
    shell_quoted = str(answer).replace("'", "'\\''")
    script_lines = [
        "#!/bin/bash",
        "set -e",
        "mkdir -p /logs/verifier",
        "",
        f"EXPECTED='{shell_quoted}'",
        "",
        # The expected value travels as argv[1]; the heredoc is quoted so the
        # shell leaves the Python source untouched.
        "python3 - \"$EXPECTED\" <<'PYEOF'",
        "import sys",
        "",
        "try:",
        "    expected = float(sys.argv[1])",
        '    with open("/app/submission.txt") as f:',
        "        actual = float(f.read().strip())",
        "    denom = abs(expected) if expected != 0 else 1.0",
        f"    reward = 1.0 if abs(actual - expected) / denom <= {tolerance} else 0.0",
        "except Exception:",
        "    reward = 0.0",
        "",
        'with open("/logs/verifier/reward.txt", "w") as f:',
        "    f.write(str(reward))",
        "PYEOF",
    ]
    return "\n".join(script_lines) + "\n"
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def list_verifier(answer: Any) -> str:
    """Return a test.sh that rewards 1.0 when the submitted items equal the expected items.

    Both the expected answer and the submission are parsed into sets of
    stripped, lower-cased strings, so order, case and duplicates are ignored.
    The sets must be EQUAL (extra submitted items score 0.0), and an empty
    expected set never scores. A missing /app/submission.txt yields reward 0.

    Args:
        answer: expected items; str(answer) should be a Python list repr or a
            comma/newline-separated string.
    """
    # The value lands inside Python source in a quoted heredoc, where the
    # shell does no processing: it must be a Python literal, not a
    # shell-escaped string. repr() also escapes newlines, so the embedded
    # value can never terminate the heredoc early.
    expected_literal = repr(str(answer))
    return f"""\
#!/bin/bash
set -e
mkdir -p /logs/verifier

python3 <<'PYEOF'
import ast, re

EXPECTED_RAW = {expected_literal}
try:
    with open("/app/submission.txt") as f:
        actual_raw = f.read().strip()
except FileNotFoundError:
    open("/logs/verifier/reward.txt", "w").write("0")
    raise SystemExit

def parse_list(s):
    try:
        val = ast.literal_eval(s)
        if isinstance(val, list):
            return {{str(x).strip().lower() for x in val}}
    except Exception:
        pass
    return {{x.strip().lower() for x in re.split(r"[,\\n]+", s) if x.strip()}}

expected = parse_list(EXPECTED_RAW)
actual = parse_list(actual_raw)
reward = 1.0 if expected and expected == actual else 0.0
open("/logs/verifier/reward.txt", "w").write(str(reward))
PYEOF
"""
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def case_insensitive_verifier(answer: Any) -> str:
    """Return a test.sh that rewards 1 when the submission matches, ignoring case and whitespace.

    Both sides are lower-cased and have ALL whitespace removed before the
    string comparison. A missing /app/submission.txt yields reward 0.

    Args:
        answer: expected answer; converted with str() and embedded as a
            single-quoted shell literal.
    """
    # Close the quote, emit an escaped quote, reopen: the standard way to put
    # a single quote inside a single-quoted shell string.
    quoted = str(answer).replace("'", "'\\''")
    # printf '%s' is used instead of echo so answers such as "-n" or "-e" are
    # not swallowed as echo options.
    return f"""\
#!/bin/bash
set -e
mkdir -p /logs/verifier

EXPECTED='{quoted}'
SUBMISSION=/app/submission.txt

if [ ! -f "$SUBMISSION" ]; then
    echo "0" > /logs/verifier/reward.txt
    exit 0
fi

ACTUAL=$(cat "$SUBMISSION" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]')
EXPECTED_CLEAN=$(printf '%s' "$EXPECTED" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]')

if [ "$ACTUAL" = "$EXPECTED_CLEAN" ]; then
    echo "1" > /logs/verifier/reward.txt
else
    echo "0" > /logs/verifier/reward.txt
fi
"""
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def script_verifier(script_container_path: str) -> str:
    """Return a test.sh that runs a Python eval script already present in the container.

    The script is fully responsible for writing a float reward to
    /logs/verifier/reward.txt. Reward 0 is written when the script file is
    missing or exits with a non-zero status.

    Args:
        script_container_path: path of the eval script inside the container.
    """
    # Single-quote the path (same escaping as the other verifiers) so that
    # `$`, backticks or double quotes in it are taken literally by bash.
    quoted = script_container_path.replace("'", "'\\''")
    return f"""\
#!/bin/bash
set -e
mkdir -p /logs/verifier

SCRIPT='{quoted}'

if [ ! -f "$SCRIPT" ]; then
    echo "0" > /logs/verifier/reward.txt
    exit 0
fi

python3 "$SCRIPT" || echo "0" > /logs/verifier/reward.txt
"""
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def no_verifier(reason: str = "External evaluation required") -> str:
    """Return a test.sh that always writes reward 0.

    Used for tasks that require external scoring (e.g. leaderboards).

    Args:
        reason: explanation embedded as a shell comment at the top of the
            script. Each line of a multi-line reason gets its own `#` prefix
            so no part of it can run as a command.
    """
    # An empty reason still produces a single "# " comment line.
    reason_lines = reason.splitlines() or [""]
    comment = "\n".join(f"# {line}" for line in reason_lines)
    return f"""\
#!/bin/bash
{comment}
mkdir -p /logs/verifier
echo "0" > /logs/verifier/reward.txt
"""
|
harborforge/mapper.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DataMapper — abstract base for benchmark → Harbor task directory pipelines.
|
|
3
|
+
|
|
4
|
+
Subclass and implement `iter_tasks()`. Override `setup()` to handle data
|
|
5
|
+
downloads or prep. Call `run()` as the standard entry point, or `map()`
|
|
6
|
+
directly for more control.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import shutil
|
|
10
|
+
import stat
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from collections.abc import Iterator
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from .handlers.base import DatasetHandler
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DataMapper(ABC):
    """
    Abstract mapper from a benchmark dataset to Harbor task directories.

    Minimal adapter implementation:
    1. Implement `iter_tasks()` to yield tasks.
    2. Optionally override `setup()` to download/prepare raw data.
    3. Call `run()` as the entry point — it handles setup, mapping, and registry.
    """

    # ---------------------------------------------------------------------------
    # Public API
    # ---------------------------------------------------------------------------

    def setup(self) -> None:
        """Download or prepare raw data before mapping. No-op by default."""

    def map(self, output_dir: Path, registry_path: Path | None = None) -> int:
        """
        Write all tasks to output_dir as Harbor task directories.
        Wipes and recreates output_dir on each call.
        If registry_path is given, auto-generates a Harbor registry.json.
        Returns the number of tasks written.
        """
        if output_dir.exists():
            shutil.rmtree(output_dir)
        # Recreate the directory up front so it exists even when iter_tasks()
        # yields nothing; generate_registry() raises on a missing directory.
        output_dir.mkdir(parents=True, exist_ok=True)

        total = 0
        current_dataset = ""
        for task_id, dir_name, handler, task_data in self.iter_tasks():
            self._write_task(output_dir / dir_name, task_id, handler, task_data)
            total += 1
            # The first path component of dir_name is treated as the dataset name.
            dataset = dir_name.split("/")[0]
            if dataset != current_dataset:
                if current_dataset:
                    # Finish the previous dataset's in-place progress line.
                    print()
                current_dataset = dataset
                print(f" ↳ {dataset}", end="", flush=True)
            # "\r" rewrites the same terminal line with the running count.
            print(f"\r ↳ {dataset} ({total} written)", end="", flush=True)
        if total:
            print()

        if registry_path is not None:
            from harborforge.registry import generate_registry

            counts = generate_registry(output_dir, registry_path)
            n_datasets = len(counts)
            n_tasks = sum(counts.values())
            print(f"✅ {registry_path} — {n_tasks} tasks across {n_datasets} datasets")

        return total

    def run(self, output_dir: Path, registry_path: Path | None = None) -> None:
        """
        Standard entry point: setup() → map() → optional registry.
        Adapters call this from __main__.py.
        """
        self.setup()
        total = self.map(output_dir, registry_path)
        print(f"📊 Total tasks written: {total}")

    # ---------------------------------------------------------------------------
    # Abstract
    # ---------------------------------------------------------------------------

    @abstractmethod
    def iter_tasks(self) -> Iterator[tuple[str, str, DatasetHandler, dict[str, Any]]]:
        """
        Yield (task_id, dir_name, handler, raw_task_data) for each task.

        - task_id: unique identifier used in task.toml name field
        - dir_name: relative path under output_dir (e.g. 'daeval/0')
        - handler: DatasetHandler instance for this dataset
        - raw_task_data: raw dict from the source benchmark file
        """

    # ---------------------------------------------------------------------------
    # Internal
    # ---------------------------------------------------------------------------

    def _write_task(
        self,
        task_dir: Path,
        task_id: str,
        handler: DatasetHandler,
        task_data: dict[str, Any],
    ) -> None:
        """Write one Harbor task directory (instruction, environment, task.toml, tests)."""
        task_dir.mkdir(parents=True, exist_ok=True)

        (task_dir / "instruction.md").write_text(handler.instruction(task_data), encoding="utf-8")

        env_dir = task_dir / "environment"
        env_dir.mkdir(exist_ok=True)

        # Copy handler-declared data files into the environment build context.
        for local_path, dest_name in handler.data_files(task_data):
            dest = env_dir / dest_name
            dest.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(local_path, dest)

        (env_dir / "Dockerfile").write_text(handler.dockerfile(task_data), encoding="utf-8")

        # Auto-detect verifier mode: SEPARATE if handler provides a verifier Dockerfile.
        # Harbor uses tests/ as the verifier build context in SEPARATE mode.
        task_toml = handler.task_toml(task_id)
        verifier_df = handler.verifier_dockerfile(task_data)
        if verifier_df is not None:
            task_toml += "\n[verifier.environment]\nbuild_timeout_sec = 300.0\n"

        (task_dir / "task.toml").write_text(task_toml, encoding="utf-8")

        tests_dir = task_dir / "tests"
        tests_dir.mkdir(exist_ok=True)
        test_sh = tests_dir / "test.sh"
        test_sh.write_text(handler.test_sh(task_data), encoding="utf-8")
        # Add the execute bit for user, group and others on top of the current mode.
        test_sh.chmod(test_sh.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)

        # SEPARATE mode: write verifier Dockerfile into tests/ (Harbor's verifier build context).
        # COPY test.sh into the image so Harbor can execute /tests/test.sh inside the container.
        if verifier_df is not None:
            verifier_df_with_copy = (
                verifier_df.rstrip()
                + "\nCOPY test.sh /tests/test.sh\nRUN chmod +x /tests/test.sh\n"
            )
            (tests_dir / "Dockerfile").write_text(verifier_df_with_copy, encoding="utf-8")
|
harborforge/registry.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generate a local Harbor registry.json from a directory of Harbor task directories.
|
|
3
|
+
|
|
4
|
+
The tasks directory is expected to have the structure:
|
|
5
|
+
tasks_dir/
|
|
6
|
+
dataset_a/
|
|
7
|
+
task_0/
|
|
8
|
+
task_1/
|
|
9
|
+
dataset_b/
|
|
10
|
+
task_0/
|
|
11
|
+
|
|
12
|
+
Each top-level subdirectory becomes a named dataset in the registry.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def generate_registry(
    tasks_dir: Path,
    output_path: Path,
    version: str = "1.0",
    description_template: str = "{name} benchmark tasks",
) -> dict[str, int]:
    """
    Scan tasks_dir and write a Harbor registry.json to output_path.

    Each sub-directory of tasks_dir that contains at least one task directory
    becomes one dataset entry. The dataset name is the directory name
    lower-cased, while the description is formatted with the directory name as
    it appears on disk. Non-directory entries are ignored, and datasets and
    tasks are listed in sorted order.

    Args:
        tasks_dir: root directory holding one sub-directory per dataset.
        output_path: file to write; missing parent directories are created.
        version: version string recorded on every dataset entry.
        description_template: format string taking a `name` field.

    Returns a dict of {dataset_name: task_count}.

    Raises:
        FileNotFoundError: if tasks_dir does not exist.
    """
    if not tasks_dir.exists():
        raise FileNotFoundError(f"Tasks directory not found: {tasks_dir}")

    datasets = []
    counts: dict[str, int] = {}

    for dataset_dir in sorted(tasks_dir.iterdir()):
        if not dataset_dir.is_dir():
            continue

        tasks = [
            {"name": task.name, "path": str(task)}
            for task in sorted(dataset_dir.iterdir())
            if task.is_dir()
        ]

        # A dataset directory with no task directories is left out entirely.
        if not tasks:
            continue

        name = dataset_dir.name.lower()
        datasets.append(
            {
                "name": name,
                "version": version,
                "description": description_template.format(name=dataset_dir.name),
                "tasks": tasks,
            }
        )
        counts[name] = len(tasks)

    # Create the destination folder if needed and pin the encoding so the
    # output does not depend on the platform's locale default.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(datasets, indent=2), encoding="utf-8")
    return counts
|
harborforge/run.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generic Harbor trial runner.
|
|
3
|
+
|
|
4
|
+
Wraps `harbor jobs start` with handler-aware artifact/credential forwarding.
|
|
5
|
+
Adapters call `run()` with their registry, tasks dir, and defaults.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import os
|
|
10
|
+
import subprocess
|
|
11
|
+
import sys
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _is_single_task(task: str) -> bool:
    """Tell a specific task ("dataset/id") apart from a bare dataset name.

    Returns True when `task` contains a "/" separator, False otherwise.
    """
    return task.find("/") != -1
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def run(
    handler_registry: dict[str, Any],
    *,
    tasks_dir: Path,
    registry_path: Path,
    job_name_prefix: str = "eval",
    default_task: str,
    default_model: str = "anthropic/claude-haiku-4-5-20251001",
) -> None:
    """
    Read command-line options and launch `harbor jobs start` for this adapter's tasks.

    Args:
        handler_registry: dict mapping dataset_name → DatasetHandler
        tasks_dir: local directory containing generated Harbor task dirs
        registry_path: path to this adapter's registry.json
        job_name_prefix: prefix for generated job names (e.g. "dsgym")
        default_task: default task or dataset to run (e.g. "daeval/0")
        default_model: default LLM model in provider/model format

    Exits via SystemExit when the requested task directory or the registry
    file is missing. A non-zero exit status from harbor raises
    subprocess.CalledProcessError (check=True).
    """
    cli = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
    cli.add_argument(
        "task", nargs="?", default=default_task, help="Task (dataset/id) or dataset name"
    )
    cli.add_argument(
        "-m", "--model", default=default_model, help="LLM model (provider/model format)"
    )
    cli.add_argument("-k", "--n-attempts", default="1", help="Attempts per task (default: 1)")
    cli.add_argument("-n", "--n-concurrent", default="4", help="Concurrent trials (default: 4)")
    cli.add_argument("-l", "--n-tasks", help="Max tasks to run from dataset")
    cli.add_argument("--max-turns", help="Max agent turns")
    cli.add_argument("--force-build", action="store_true", help="Force Docker image rebuild")
    cli.add_argument(
        "--artifact",
        action="append",
        metavar="PATH",
        help="Container path to capture as artifact (repeatable)",
    )
    opts = cli.parse_args()
    target = opts.task

    # A single task is addressed by its directory; a dataset goes through the registry.
    if _is_single_task(target):
        single_task_dir = tasks_dir / target
        if not single_task_dir.exists():
            raise SystemExit(f"Task not found: {single_task_dir}")
        source_flags = ["-p", str(single_task_dir)]
    else:
        if not registry_path.exists():
            raise SystemExit(f"registry.json not found at {registry_path} — run 'just data' first")
        source_flags = ["-d", target.lower(), "--registry-path", str(registry_path)]

    # Job name: prefix, task slug, and the low five digits of the Unix time.
    slug = target.replace("/", "__")
    stamp = int(time.time()) % 100_000
    job_name = f"{job_name_prefix}__{slug}__{stamp}"

    # The harbor executable is looked up next to the running interpreter.
    harbor_bin = Path(sys.executable).parent / "harbor"

    command = [str(harbor_bin), "jobs", "start"]
    command.extend(source_flags)
    command.extend(
        [
            "-a",
            "terminus-2",
            "-m",
            opts.model,
            "--n-attempts",
            opts.n_attempts,
            "--n-concurrent",
            opts.n_concurrent,
            "--job-name",
            job_name,
            "-y",
        ]
    )
    if opts.n_tasks:
        command.extend(["--n-tasks", opts.n_tasks])
    if opts.max_turns:
        command.extend(["--ak", f"max_turns={opts.max_turns}"])
    if opts.force_build:
        command.append("--force-build")

    # Forward handler-declared artifacts and verifier env vars
    handler = handler_registry.get(target.split("/")[0])

    # CLI-supplied artifacts come first; handler paths are added only when new.
    artifact_paths = list(opts.artifact or [])
    if handler:
        artifact_paths.extend(
            path
            for path in dict.fromkeys(handler.artifacts())
            if path not in artifact_paths
        )
    for artifact_path in artifact_paths:
        command.extend(["--artifact", artifact_path])

    if handler:
        for env_key in handler.verifier_env_keys():
            env_value = os.environ.get(env_key)
            # Unset and empty variables are not forwarded.
            if env_value:
                command.extend(["--ve", f"{env_key}={env_value}"])

    subprocess.run(command, check=True)
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
harborforge/__init__.py,sha256=QfgLX95O8k5nUGBfkZ_DEU5Kth74F_KCdK7-H2fSsMQ,216
|
|
2
|
+
harborforge/mapper.py,sha256=fl8jQBsPvY_OJb6ZNoTZZ-I4vov9hRProS3EsIxnt0U,5588
|
|
3
|
+
harborforge/registry.py,sha256=6VshQrjnMoQQjrUuwgj1Rg4DkJc_fchVyAHvDYvYDVM,1549
|
|
4
|
+
harborforge/run.py,sha256=opKStzKmNvSCa6BSyMnGcQxJdLOLyUBaVzhZH6EjWc4,3959
|
|
5
|
+
harborforge/handlers/__init__.py,sha256=14x5nOh5-Rt8J_yZpQCqeD0Xod0eLQmF3VZjVV5uFhA,363
|
|
6
|
+
harborforge/handlers/base.py,sha256=yopagTLuo_k6CKfZuMcaeaUzk9rdTuayz-hIqj-MNyw,6591
|
|
7
|
+
harborforge-1.0.0.dist-info/METADATA,sha256=1XhEcaG-22cVfMUuYXDXtZcPzHrrGTILf0gSGvrFX-c,227
|
|
8
|
+
harborforge-1.0.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
+
harborforge-1.0.0.dist-info/RECORD,,
|