harborforge 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- harborforge-1.0.0/.github/workflows/ci.yml +25 -0
- harborforge-1.0.0/.gitignore +19 -0
- harborforge-1.0.0/CLAUDE.md +136 -0
- harborforge-1.0.0/PKG-INFO +8 -0
- harborforge-1.0.0/dist/.gitignore +1 -0
- harborforge-1.0.0/harborforge/__init__.py +6 -0
- harborforge-1.0.0/harborforge/handlers/__init__.py +19 -0
- harborforge-1.0.0/harborforge/handlers/base.py +231 -0
- harborforge-1.0.0/harborforge/mapper.py +142 -0
- harborforge-1.0.0/harborforge/registry.py +61 -0
- harborforge-1.0.0/harborforge/run.py +117 -0
- harborforge-1.0.0/justfile +18 -0
- harborforge-1.0.0/pyproject.toml +35 -0
- harborforge-1.0.0/readme.md +82 -0
- harborforge-1.0.0/tests/__init__.py +0 -0
- harborforge-1.0.0/tests/harborforge/__init__.py +0 -0
- harborforge-1.0.0/tests/harborforge/test_handlers_base.py +255 -0
- harborforge-1.0.0/tests/harborforge/test_mapper.py +180 -0
- harborforge-1.0.0/tests/harborforge/test_registry.py +94 -0
- harborforge-1.0.0/tests/harborforge/test_run.py +23 -0
- harborforge-1.0.0/uv.lock +1827 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- uses: astral-sh/setup-uv@v5
|
|
14
|
+
with:
|
|
15
|
+
python-version: "3.13"
|
|
16
|
+
- name: Install dependencies
|
|
17
|
+
run: uv sync --extra dev
|
|
18
|
+
- name: Format check
|
|
19
|
+
run: uv run ruff format --check .
|
|
20
|
+
- name: Lint
|
|
21
|
+
run: uv run ruff check .
|
|
22
|
+
- name: Type check
|
|
23
|
+
run: uv run mypy harborforge/
|
|
24
|
+
- name: Tests
|
|
25
|
+
run: uv run pytest tests/ -v
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# CLAUDE.md — g9
|
|
2
|
+
|
|
3
|
+
## What this is
|
|
4
|
+
|
|
5
|
+
**g9** maps evaluation benchmark datasets into [Harbor](https://github.com/laude-institute/harbor)-compatible task directories, enabling large-scale parallel agent evaluation.
|
|
6
|
+
|
|
7
|
+
The core package (`harborforge/`) is benchmark-agnostic. Each benchmark gets its own adapter in `adapters/`. The first reference implementation is DSGym.
|
|
8
|
+
|
|
9
|
+
## Repo layout
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
harborforge/ # pip-installable core — abstract contracts only
|
|
13
|
+
mapper.py # DataMapper base class (setup, map, run)
|
|
14
|
+
run.py # Generic Harbor trial runner (wraps harbor jobs start)
|
|
15
|
+
registry.py # generate_registry() — produces registry.json
|
|
16
|
+
handlers/
|
|
17
|
+
base.py # DatasetHandler base class + verifier templates
|
|
18
|
+
|
|
19
|
+
adapters/ # concrete benchmark implementations
|
|
20
|
+
dsgym/
|
|
21
|
+
mapper.py # DSGymMapper(DataMapper)
|
|
22
|
+
base_image.py # BASE_IMAGE_NAME, BASE_IMAGE constants
|
|
23
|
+
eval.Dockerfile # shared base Docker image — built by `just build-base`
|
|
24
|
+
run_trial.py # thin wrapper: calls harborforge.run with DSGym config
|
|
25
|
+
handlers/ # one file per DSGym dataset type
|
|
26
|
+
download.py # Kaggle dataset downloader
|
|
27
|
+
|
|
28
|
+
tests/
|
|
29
|
+
harborforge/ # unit + integration tests for core
|
|
30
|
+
adapters/dsgym/ # tests for the DSGym adapter
|
|
31
|
+
|
|
32
|
+
tools/
|
|
33
|
+
benchmark_builds.py # Docker build-time benchmark (base image vs no base)
|
|
34
|
+
|
|
35
|
+
.data/ # local data (gitignored)
|
|
36
|
+
task/ # raw DSGym task files (.json / .jsonl)
|
|
37
|
+
data/ # raw data files (CSVs, .h5ad, etc.)
|
|
38
|
+
tasks/ # generated Harbor task directories
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Setup
|
|
42
|
+
|
|
43
|
+
```sh
|
|
44
|
+
pip install uv # Python package manager
|
|
45
|
+
brew install just # task runner
|
|
46
|
+
|
|
47
|
+
just setup # create venv + install deps + download data
|
|
48
|
+
just build-base # build shared Docker base image (run once)
|
|
49
|
+
just data # download raw data + run mapper → .data/tasks/
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Credentials go in `.env` (gitignored):
|
|
53
|
+
```
|
|
54
|
+
ANTHROPIC_API_KEY=sk-ant-...
|
|
55
|
+
KAGGLE_TOKEN=KGAT_... # for DSPredict tasks
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Running things
|
|
59
|
+
|
|
60
|
+
```sh
|
|
61
|
+
just trial daeval/0 # single task
|
|
62
|
+
just trial daeval -l 20 # 20 tasks from a dataset
|
|
63
|
+
just trial dspredict/titanic # Kaggle competition task
|
|
64
|
+
just trial daeval/0 -k 3 # 3 attempts
|
|
65
|
+
just test # pytest
|
|
66
|
+
just format # ruff + mypy
|
|
67
|
+
just view # open Harbor trajectory viewer
|
|
68
|
+
just benchmark # Docker build-time benchmark
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Adding a new benchmark adapter
|
|
72
|
+
|
|
73
|
+
1. Create `adapters/<name>/` with:
|
|
74
|
+
- `handlers/` — one `DatasetHandler` subclass per dataset type
|
|
75
|
+
- `mapper.py` — `DataMapper` subclass implementing `iter_tasks()`
|
|
76
|
+
- `__main__.py` — instantiates your `DataMapper` subclass and calls `.run(output_dir, registry_path)` on it
|
|
77
|
+
- `run_trial.py` — calls `harborforge.run.run()` with your registry
|
|
78
|
+
|
|
79
|
+
2. Implement the handler contract (see table below).
|
|
80
|
+
|
|
81
|
+
3. Add a `just data <name>` call — the existing recipe dispatches to `python -m adapters.<name>`.
|
|
82
|
+
|
|
83
|
+
See `adapters/dsgym/` as the reference implementation.
|
|
84
|
+
|
|
85
|
+
## Handler contract
|
|
86
|
+
|
|
87
|
+
Every handler subclasses `harborforge.handlers.DatasetHandler`:
|
|
88
|
+
|
|
89
|
+
| Method | Required | Purpose |
|
|
90
|
+
|--------|----------|---------|
|
|
91
|
+
| `instruction(task_data)` | ✅ | Content for `instruction.md` — no answer leakage |
|
|
92
|
+
| `test_sh(task_data)` | ✅ | Content for `tests/test.sh` — must write float reward to `/logs/verifier/reward.txt` |
|
|
93
|
+
| `dockerfile(task_data)` | ✅ | Content for `environment/Dockerfile` |
|
|
94
|
+
| `setup()` | optional | Download/prepare data for this dataset |
|
|
95
|
+
| `data_files(task_data)` | optional | Local files to COPY into the image build context |
|
|
96
|
+
| `artifacts()` | optional | Container paths to capture after trial |
|
|
97
|
+
| `verifier_env_keys()` | optional | Env var keys to forward to the SEPARATE verifier |
|
|
98
|
+
| `verifier_dockerfile(task_data)` | optional | Non-None triggers SEPARATE verifier mode |
|
|
99
|
+
|
|
100
|
+
## Harbor task contract
|
|
101
|
+
|
|
102
|
+
Each generated task directory must contain:
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
<task>/
|
|
106
|
+
├── instruction.md # shown to the agent (no answer)
|
|
107
|
+
├── task.toml # Harbor config (timeouts, env)
|
|
108
|
+
├── environment/
|
|
109
|
+
│ └── Dockerfile # agent container — data files go here
|
|
110
|
+
└── tests/
|
|
111
|
+
├── test.sh # verifier script — writes reward to /logs/verifier/reward.txt
|
|
112
|
+
└── Dockerfile # (SEPARATE mode only) verifier container
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Verifier modes
|
|
116
|
+
|
|
117
|
+
**SHARED** (default): verifier runs inside the agent container after the agent finishes.
|
|
118
|
+
|
|
119
|
+
**SEPARATE**: verifier runs in its own isolated container. Triggered when `handler.verifier_dockerfile()` returns non-None. Harbor re-materializes agent artifacts at their original source paths inside the verifier container — e.g. `/app/submission.csv` in the agent maps to `/app/submission.csv` in the verifier.
|
|
120
|
+
|
|
121
|
+
## Code style
|
|
122
|
+
|
|
123
|
+
- Simple and concise over clever. Three similar lines beat a premature abstraction.
|
|
124
|
+
- No comments on obvious code. Only comment on non-obvious invariants or workarounds.
|
|
125
|
+
- No dead code. No backwards-compat shims, no unused parameters.
|
|
126
|
+
- Validate at system boundaries only. Trust internal code.
|
|
127
|
+
- Type hints on all public APIs.
|
|
128
|
+
- `harborforge/` must stay benchmark-agnostic. No benchmark-specific strings belong there.
|
|
129
|
+
- Base image: always import `BASE_IMAGE` from `adapters/<name>/base_image.py`. Never hardcode `FROM python:...` in handlers.
|
|
130
|
+
|
|
131
|
+
## Python conventions
|
|
132
|
+
|
|
133
|
+
- Python 3.13+, managed with `uv`.
|
|
134
|
+
- `pyproject.toml` is the source of truth for dependencies.
|
|
135
|
+
- Tests live in `tests/`, mirror the package structure, use `pytest`.
|
|
136
|
+
- Formatters: `ruff` for linting/formatting, `mypy` for types. Run `just format` before committing.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
*
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from .base import (
|
|
2
|
+
DatasetHandler,
|
|
3
|
+
case_insensitive_verifier,
|
|
4
|
+
exact_match_verifier,
|
|
5
|
+
list_verifier,
|
|
6
|
+
no_verifier,
|
|
7
|
+
numeric_verifier,
|
|
8
|
+
script_verifier,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"DatasetHandler",
|
|
13
|
+
"case_insensitive_verifier",
|
|
14
|
+
"exact_match_verifier",
|
|
15
|
+
"list_verifier",
|
|
16
|
+
"no_verifier",
|
|
17
|
+
"numeric_verifier",
|
|
18
|
+
"script_verifier",
|
|
19
|
+
]
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DatasetHandler contract.
|
|
3
|
+
|
|
4
|
+
Each dataset subclass must implement:
|
|
5
|
+
- instruction(task_data) → str content for instruction.md (shown to the agent, no answer leakage)
|
|
6
|
+
- test_sh(task_data) → str content for tests/test.sh (must write reward to /logs/verifier/reward.txt)
|
|
7
|
+
- dockerfile(task_data) → str content for environment/Dockerfile
|
|
8
|
+
|
|
9
|
+
Optional overrides:
|
|
10
|
+
- data_files(task_data) → list[tuple[Path, str]] (local_path, dest_in_build_context) pairs
|
|
11
|
+
- task_toml(task_id) → str content for task.toml
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from abc import ABC, abstractmethod
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DatasetHandler(ABC):
|
|
20
|
+
dataset_name: str
|
|
21
|
+
|
|
22
|
+
@abstractmethod
|
|
23
|
+
def instruction(self, task_data: dict[str, Any]) -> str:
|
|
24
|
+
"""Returns instruction.md content — what the agent sees. Must not include the answer."""
|
|
25
|
+
|
|
26
|
+
@abstractmethod
|
|
27
|
+
def test_sh(self, task_data: dict[str, Any]) -> str:
|
|
28
|
+
"""Returns tests/test.sh content. Must write a float reward to /logs/verifier/reward.txt."""
|
|
29
|
+
|
|
30
|
+
@abstractmethod
|
|
31
|
+
def dockerfile(self, task_data: dict[str, Any]) -> str:
|
|
32
|
+
"""Returns environment/Dockerfile content."""
|
|
33
|
+
|
|
34
|
+
def verifier_dockerfile(self, task_data: dict[str, Any]) -> str | None:
|
|
35
|
+
"""Return a Dockerfile for a SEPARATE verifier container, or None for SHARED (default).
|
|
36
|
+
|
|
37
|
+
When non-None, Harbor routes the trial through SEPARATE mode: the verifier runs in
|
|
38
|
+
its own isolated container and receives agent artifacts as input. Use this when the
|
|
39
|
+
verifier needs different dependencies or ground truth data isolated from the agent.
|
|
40
|
+
"""
|
|
41
|
+
return None
|
|
42
|
+
|
|
43
|
+
def setup(self) -> None:
|
|
44
|
+
"""Download or prepare local data required by this dataset. No-op by default."""
|
|
45
|
+
|
|
46
|
+
def artifacts(self) -> list[str]:
|
|
47
|
+
"""Container paths to capture as artifacts after a trial. Empty by default."""
|
|
48
|
+
return []
|
|
49
|
+
|
|
50
|
+
def verifier_env_keys(self) -> list[str]:
|
|
51
|
+
"""Environment variable keys to forward to the SEPARATE verifier container."""
|
|
52
|
+
return []
|
|
53
|
+
|
|
54
|
+
def data_files(self, task_data: dict[str, Any]) -> list[tuple[Path, str]]:
|
|
55
|
+
"""
|
|
56
|
+
Returns (local_path, dest_in_build_context) pairs for files to COPY into the image.
|
|
57
|
+
dest_in_build_context is relative to environment/ (e.g. 'data/train.csv').
|
|
58
|
+
"""
|
|
59
|
+
return []
|
|
60
|
+
|
|
61
|
+
def task_toml(self, task_id: str) -> str:
|
|
62
|
+
return f"""\
|
|
63
|
+
version = "1.0"
|
|
64
|
+
|
|
65
|
+
[task]
|
|
66
|
+
name = "{task_id}"
|
|
67
|
+
|
|
68
|
+
[metadata]
|
|
69
|
+
|
|
70
|
+
[verifier]
|
|
71
|
+
timeout_sec = 900.0
|
|
72
|
+
|
|
73
|
+
[agent]
|
|
74
|
+
timeout_sec = 900.0
|
|
75
|
+
|
|
76
|
+
[environment]
|
|
77
|
+
build_timeout_sec = 600.0
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Verifier shell script templates
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def exact_match_verifier(answer: Any) -> str:
    """Return a bash verifier that rewards an exact match of the agent's answer.

    The generated script compares /app/submission.txt with `str(answer)` after
    deleting ALL whitespace from both sides (not only leading/trailing), and
    writes "1" or "0" to /logs/verifier/reward.txt. A missing submission file
    scores "0".
    """
    # Escape for a single-quoted shell word: close the quote, emit \', reopen.
    safe = str(answer).replace("'", "'\\''")
    # printf '%s' is used instead of echo so answers such as "-n" or "-e" are
    # not interpreted as echo options and silently dropped.
    return f"""\
#!/bin/bash
set -e
mkdir -p /logs/verifier

EXPECTED='{safe}'
SUBMISSION=/app/submission.txt

if [ ! -f "$SUBMISSION" ]; then
    echo "0" > /logs/verifier/reward.txt
    exit 0
fi

ACTUAL=$(cat "$SUBMISSION" | tr -d '[:space:]')
EXPECTED_CLEAN=$(printf '%s' "$EXPECTED" | tr -d '[:space:]')

if [ "$ACTUAL" = "$EXPECTED_CLEAN" ]; then
    echo "1" > /logs/verifier/reward.txt
else
    echo "0" > /logs/verifier/reward.txt
fi
"""
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def numeric_verifier(answer: Any, tolerance: float = 0.05) -> str:
    """Return a bash verifier that accepts a numeric answer within a relative tolerance.

    The generated script hands `str(answer)` to an embedded Python program,
    which parses it and /app/submission.txt as floats. The reward is 1.0 when
    |actual - expected| / |expected| <= `tolerance` (the denominator is 1.0 if
    expected is 0), and 0.0 otherwise or on any exception. The reward is
    written to /logs/verifier/reward.txt.
    """
    # Single-quote escaping for the shell assignment below.
    shell_quoted = str(answer).replace("'", "'\\''")
    script_lines = [
        "#!/bin/bash",
        "set -e",
        "mkdir -p /logs/verifier",
        "",
        f"EXPECTED='{shell_quoted}'",
        "",
        # Quoted heredoc delimiter: the shell leaves the Python source untouched.
        "python3 - \"$EXPECTED\" <<'PYEOF'",
        "import sys",
        "",
        "try:",
        "    expected = float(sys.argv[1])",
        '    with open("/app/submission.txt") as f:',
        "        actual = float(f.read().strip())",
        "    denom = abs(expected) if expected != 0 else 1.0",
        f"    reward = 1.0 if abs(actual - expected) / denom <= {tolerance} else 0.0",
        "except Exception:",
        "    reward = 0.0",
        "",
        'with open("/logs/verifier/reward.txt", "w") as f:',
        "    f.write(str(reward))",
        "PYEOF",
        "",  # trailing newline after the heredoc terminator
    ]
    return "\n".join(script_lines)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def list_verifier(answer: Any) -> str:
    """Return a bash verifier that compares the agent's list with the expected list.

    Both `str(answer)` and /app/submission.txt are parsed into sets of
    stripped, lower-cased strings: a Python list literal is used when the text
    parses as one, otherwise the text is split on commas and newlines. The
    reward is 1.0 only when the expected set is non-empty and EQUAL to the
    submitted set (order- and case-insensitive), else 0.0. A missing
    submission file scores 0. The reward is written to
    /logs/verifier/reward.txt.
    """
    # The value is embedded in Python source inside a quoted heredoc, so the
    # shell never sees it: it needs a Python string literal, not shell quoting.
    # repr() handles quotes, backslashes and newlines (e.g. "['a', 'b']").
    expected_literal = repr(str(answer))
    return f"""\
#!/bin/bash
set -e
mkdir -p /logs/verifier

python3 <<'PYEOF'
import ast, re

EXPECTED_RAW = {expected_literal}
try:
    with open("/app/submission.txt") as f:
        actual_raw = f.read().strip()
except FileNotFoundError:
    with open("/logs/verifier/reward.txt", "w") as f:
        f.write("0")
    raise SystemExit

def parse_list(s):
    try:
        val = ast.literal_eval(s)
        if isinstance(val, list):
            return {{str(x).strip().lower() for x in val}}
    except Exception:
        pass
    return {{x.strip().lower() for x in re.split(r"[,\\n]+", s) if x.strip()}}

expected = parse_list(EXPECTED_RAW)
actual = parse_list(actual_raw)
reward = 1.0 if expected and expected == actual else 0.0
with open("/logs/verifier/reward.txt", "w") as f:
    f.write(str(reward))
PYEOF
"""
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def case_insensitive_verifier(answer: Any) -> str:
    """Return a bash verifier that matches the answer ignoring case and whitespace.

    The generated script lower-cases /app/submission.txt and `str(answer)`,
    deletes all whitespace from both, and writes "1" to
    /logs/verifier/reward.txt when they are equal, "0" otherwise. A missing
    submission file scores "0".
    """
    # Escape for a single-quoted shell word: close the quote, emit \', reopen.
    safe = str(answer).replace("'", "'\\''")
    # printf '%s' is used instead of echo so answers such as "-n" or "-e" are
    # not interpreted as echo options and silently dropped.
    return f"""\
#!/bin/bash
set -e
mkdir -p /logs/verifier

EXPECTED='{safe}'
SUBMISSION=/app/submission.txt

if [ ! -f "$SUBMISSION" ]; then
    echo "0" > /logs/verifier/reward.txt
    exit 0
fi

ACTUAL=$(cat "$SUBMISSION" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]')
EXPECTED_CLEAN=$(printf '%s' "$EXPECTED" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]')

if [ "$ACTUAL" = "$EXPECTED_CLEAN" ]; then
    echo "1" > /logs/verifier/reward.txt
else
    echo "0" > /logs/verifier/reward.txt
fi
"""
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def script_verifier(script_container_path: str) -> str:
    """Return a bash verifier that delegates scoring to a Python script in the container.

    The generated script writes "0" to /logs/verifier/reward.txt when
    `script_container_path` does not exist, or when running it with python3
    exits non-zero. Otherwise the eval script itself is fully responsible for
    writing a float reward to /logs/verifier/reward.txt.
    """
    # Single-quote the path, matching the other verifier templates, so that
    # characters such as $, ` or " in it are taken literally by the shell.
    safe = script_container_path.replace("'", "'\\''")
    return f"""\
#!/bin/bash
set -e
mkdir -p /logs/verifier

SCRIPT='{safe}'

if [ ! -f "$SCRIPT" ]; then
    echo "0" > /logs/verifier/reward.txt
    exit 0
fi

python3 "$SCRIPT" || echo "0" > /logs/verifier/reward.txt
"""
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def no_verifier(reason: str = "External evaluation required") -> str:
    """Return a bash verifier that always writes reward 0.

    Used for tasks that require external scoring (e.g. leaderboards). `reason`
    is recorded as a shell comment at the top of the script. Every line of a
    multi-line reason is commented, so no part of it can run as a command.
    """
    # splitlines() returns [] for an empty reason; keep a single "# " line then.
    comment = "\n".join(f"# {line}" for line in (reason.splitlines() or [""]))
    return f"""\
#!/bin/bash
{comment}
mkdir -p /logs/verifier
echo "0" > /logs/verifier/reward.txt
"""
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DataMapper — abstract base for benchmark → Harbor task directory pipelines.
|
|
3
|
+
|
|
4
|
+
Subclass and implement `iter_tasks()`. Override `setup()` to handle data
|
|
5
|
+
downloads or prep. Call `run()` as the standard entry point, or `map()`
|
|
6
|
+
directly for more control.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import shutil
|
|
10
|
+
import stat
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from collections.abc import Iterator
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from .handlers.base import DatasetHandler
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DataMapper(ABC):
    """
    Abstract mapper from a benchmark dataset to Harbor task directories.

    Minimal adapter implementation:
    1. Implement `iter_tasks()` to yield tasks.
    2. Optionally override `setup()` to download/prepare raw data.
    3. Call `run()` as the entry point — it handles setup, mapping, and registry.
    """

    # ---------------------------------------------------------------------------
    # Public API
    # ---------------------------------------------------------------------------

    def setup(self) -> None:
        """Download or prepare raw data before mapping. No-op by default."""

    def map(self, output_dir: Path, registry_path: Path | None = None) -> int:
        """
        Write all tasks to output_dir as Harbor task directories.

        Wipes and recreates output_dir on each call, even when `iter_tasks()`
        yields nothing. If registry_path is given, auto-generates a Harbor
        registry.json from the written directories. Progress is printed to
        stdout. Returns the number of tasks written.
        """
        if output_dir.exists():
            shutil.rmtree(output_dir)
        # Recreate eagerly: with zero tasks nothing below would create the
        # directory, and generate_registry() raises FileNotFoundError without it.
        output_dir.mkdir(parents=True, exist_ok=True)

        total = 0
        current_dataset = ""
        for task_id, dir_name, handler, task_data in self.iter_tasks():
            self._write_task(output_dir / dir_name, task_id, handler, task_data)
            total += 1
            # The first path component of dir_name is treated as the dataset name.
            dataset = dir_name.split("/")[0]
            if dataset != current_dataset:
                if current_dataset:
                    # Finish the previous dataset's progress line.
                    print()
                current_dataset = dataset
                print(f" ↳ {dataset}", end="", flush=True)
            # \r rewrites the same terminal line; `total` is the running count
            # across all datasets so far.
            print(f"\r ↳ {dataset} ({total} written)", end="", flush=True)
        if total:
            print()

        if registry_path is not None:
            from harborforge.registry import generate_registry

            counts = generate_registry(output_dir, registry_path)
            n_datasets = len(counts)
            n_tasks = sum(counts.values())
            print(f"✅ {registry_path} — {n_tasks} tasks across {n_datasets} datasets")

        return total

    def run(self, output_dir: Path, registry_path: Path | None = None) -> None:
        """
        Standard entry point: setup() → map() → optional registry.
        Adapters call this from __main__.py.
        """
        self.setup()
        total = self.map(output_dir, registry_path)
        print(f"📊 Total tasks written: {total}")

    # ---------------------------------------------------------------------------
    # Abstract
    # ---------------------------------------------------------------------------

    @abstractmethod
    def iter_tasks(self) -> Iterator[tuple[str, str, DatasetHandler, dict[str, Any]]]:
        """
        Yield (task_id, dir_name, handler, raw_task_data) for each task.

        - task_id: unique identifier used in task.toml name field
        - dir_name: relative path under output_dir (e.g. 'daeval/0')
        - handler: DatasetHandler instance for this dataset
        - raw_task_data: raw dict from the source benchmark file
        """

    # ---------------------------------------------------------------------------
    # Internal
    # ---------------------------------------------------------------------------

    def _write_task(
        self,
        task_dir: Path,
        task_id: str,
        handler: DatasetHandler,
        task_data: dict[str, Any],
    ) -> None:
        """Materialize one Harbor task directory at `task_dir` from `handler` output.

        Writes instruction.md, environment/ (data files plus Dockerfile),
        task.toml and an executable tests/test.sh. When the handler returns a
        verifier Dockerfile, also writes tests/Dockerfile and appends a
        [verifier.environment] table to task.toml.
        """
        task_dir.mkdir(parents=True, exist_ok=True)

        (task_dir / "instruction.md").write_text(handler.instruction(task_data), encoding="utf-8")

        env_dir = task_dir / "environment"
        env_dir.mkdir(exist_ok=True)

        for local_path, dest_name in handler.data_files(task_data):
            dest = env_dir / dest_name
            # dest_name may contain subdirectories (e.g. 'data/train.csv').
            dest.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(local_path, dest)

        (env_dir / "Dockerfile").write_text(handler.dockerfile(task_data), encoding="utf-8")

        # Auto-detect verifier mode: SEPARATE if handler provides a verifier Dockerfile.
        # Harbor uses tests/ as the verifier build context in SEPARATE mode.
        task_toml = handler.task_toml(task_id)
        verifier_df = handler.verifier_dockerfile(task_data)
        if verifier_df is not None:
            task_toml += "\n[verifier.environment]\nbuild_timeout_sec = 300.0\n"

        (task_dir / "task.toml").write_text(task_toml, encoding="utf-8")

        tests_dir = task_dir / "tests"
        tests_dir.mkdir(exist_ok=True)
        test_sh = tests_dir / "test.sh"
        test_sh.write_text(handler.test_sh(task_data), encoding="utf-8")
        # Add the execute bit for user, group and others on top of the current mode.
        test_sh.chmod(test_sh.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)

        # SEPARATE mode: write verifier Dockerfile into tests/ (Harbor's verifier build context).
        # COPY test.sh into the image so Harbor can execute /tests/test.sh inside the container.
        if verifier_df is not None:
            verifier_df_with_copy = (
                verifier_df.rstrip()
                + "\nCOPY test.sh /tests/test.sh\nRUN chmod +x /tests/test.sh\n"
            )
            (tests_dir / "Dockerfile").write_text(verifier_df_with_copy, encoding="utf-8")
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generate a local Harbor registry.json from a directory of Harbor task directories.
|
|
3
|
+
|
|
4
|
+
The tasks directory is expected to have the structure:
|
|
5
|
+
tasks_dir/
|
|
6
|
+
dataset_a/
|
|
7
|
+
task_0/
|
|
8
|
+
task_1/
|
|
9
|
+
dataset_b/
|
|
10
|
+
task_0/
|
|
11
|
+
|
|
12
|
+
Each top-level subdirectory becomes a named dataset in the registry.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def generate_registry(
    tasks_dir: Path,
    output_path: Path,
    version: str = "1.0",
    description_template: str = "{name} benchmark tasks",
) -> dict[str, int]:
    """
    Scan tasks_dir and write a Harbor registry.json to output_path.

    Each subdirectory of tasks_dir that contains at least one task directory
    becomes a dataset entry. The entry's name is the lower-cased directory
    name; its description is `description_template` formatted with the
    original directory name. Non-directory entries and empty datasets are
    skipped. Missing parent directories of output_path are created.

    Returns a dict of {dataset_name: task_count}.

    Raises FileNotFoundError if tasks_dir does not exist.
    """
    if not tasks_dir.exists():
        raise FileNotFoundError(f"Tasks directory not found: {tasks_dir}")

    datasets: list[dict[str, object]] = []
    counts: dict[str, int] = {}

    # Sorted at both levels so the generated file is deterministic.
    for dataset_dir in sorted(tasks_dir.iterdir()):
        if not dataset_dir.is_dir():
            continue

        tasks = [
            {"name": task.name, "path": str(task)}
            for task in sorted(dataset_dir.iterdir())
            if task.is_dir()
        ]

        if not tasks:
            continue

        name = dataset_dir.name.lower()
        datasets.append(
            {
                "name": name,
                "version": version,
                "description": description_template.format(name=dataset_dir.name),
                "tasks": tasks,
            }
        )
        counts[name] = len(tasks)

    # Callers may point at a registry location whose directory does not exist yet.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(datasets, indent=2), encoding="utf-8")
    return counts
|