harborforge 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: astral-sh/setup-uv@v5
14
+ with:
15
+ python-version: "3.13"
16
+ - name: Install dependencies
17
+ run: uv sync --extra dev
18
+ - name: Format check
19
+ run: uv run ruff format --check .
20
+ - name: Lint
21
+ run: uv run ruff check .
22
+ - name: Type check
23
+ run: uv run mypy harborforge/
24
+ - name: Tests
25
+ run: uv run pytest tests/ -v
@@ -0,0 +1,19 @@
1
+ .DS_Store
2
+ .*
3
+ !.github
4
+ !.github/**
5
+ !.gitignore
6
+ requirements.txt
7
+ __pycache__
8
+ *.pyc
9
+ *.pyo
10
+ *.pyd
11
+ *.pyw
12
+ *.pyz
13
+ *.pywz
14
+ *.pyzw
15
+ *.pyzwz
16
+ trials/
17
+ jobs/
18
+ registry.json
19
+ .docs/
@@ -0,0 +1,136 @@
1
+ # CLAUDE.md — g9
2
+
3
+ ## What this is
4
+
5
+ **g9** maps evaluation benchmark datasets into [Harbor](https://github.com/laude-institute/harbor)-compatible task directories, enabling large-scale parallel agent evaluation.
6
+
7
+ The core package (`harborforge/`) is benchmark-agnostic. Each benchmark gets its own adapter in `adapters/`. The first reference implementation is DSGym.
8
+
9
+ ## Repo layout
10
+
11
+ ```
12
+ harborforge/ # pip-installable core — abstract contracts only
13
+ mapper.py # DataMapper base class (setup, map, run)
14
+ run.py # Generic Harbor trial runner (wraps harbor jobs start)
15
+ registry.py # generate_registry() — produces registry.json
16
+ handlers/
17
+ base.py # DatasetHandler base class + verifier templates
18
+
19
+ adapters/ # concrete benchmark implementations
20
+ dsgym/
21
+ mapper.py # DSGymMapper(DataMapper)
22
+ base_image.py # BASE_IMAGE_NAME, BASE_IMAGE constants
23
+ eval.Dockerfile # shared base Docker image — built by `just build-base`
24
+ run_trial.py # thin wrapper: calls harborforge.run with DSGym config
25
+ handlers/ # one file per DSGym dataset type
26
+ download.py # Kaggle dataset downloader
27
+
28
+ tests/
29
+ harborforge/ # unit + integration tests for core
30
+ adapters/dsgym/ # tests for the DSGym adapter
31
+
32
+ tools/
33
+ benchmark_builds.py # Docker build-time benchmark (base image vs no base)
34
+
35
+ .data/ # local data (gitignored)
36
+ task/ # raw DSGym task files (.json / .jsonl)
37
+ data/ # raw data files (CSVs, .h5ad, etc.)
38
+ tasks/ # generated Harbor task directories
39
+ ```
40
+
41
+ ## Setup
42
+
43
+ ```sh
44
+ pip install uv # Python package manager
45
+ brew install just # task runner
46
+
47
+ just setup # create venv + install deps + download data
48
+ just build-base # build shared Docker base image (run once)
49
+ just data # download raw data + run mapper → .data/tasks/
50
+ ```
51
+
52
+ Credentials go in `.env` (gitignored):
53
+ ```
54
+ ANTHROPIC_API_KEY=sk-ant-...
55
+ KAGGLE_TOKEN=KGAT_... # for DSPredict tasks
56
+ ```
57
+
58
+ ## Running things
59
+
60
+ ```sh
61
+ just trial daeval/0 # single task
62
+ just trial daeval -l 20 # 20 tasks from a dataset
63
+ just trial dspredict/titanic # Kaggle competition task
64
+ just trial daeval/0 -k 3 # 3 attempts
65
+ just test # pytest
66
+ just format # ruff + mypy
67
+ just view # open Harbor trajectory viewer
68
+ just benchmark # Docker build-time benchmark
69
+ ```
70
+
71
+ ## Adding a new benchmark adapter
72
+
73
+ 1. Create `adapters/<name>/` with:
74
+ - `handlers/` — one `DatasetHandler` subclass per dataset type
75
+ - `mapper.py` — `DataMapper` subclass implementing `iter_tasks()`
76
+ - `__main__.py` — instantiates your `DataMapper` subclass and calls `.run(output_dir, registry_path)` on it
77
+ - `run_trial.py` — calls `harborforge.run.run()` with your registry
78
+
79
+ 2. Implement the handler contract (see table below).
80
+
81
+ 3. Add a `just data <name>` call — the existing recipe dispatches to `python -m adapters.<name>`.
82
+
83
+ See `adapters/dsgym/` as the reference implementation.
84
+
85
+ ## Handler contract
86
+
87
+ Every handler subclasses `harborforge.handlers.DatasetHandler`:
88
+
89
+ | Method | Required | Purpose |
90
+ |--------|----------|---------|
91
+ | `instruction(task_data)` | ✅ | Content for `instruction.md` — no answer leakage |
92
+ | `test_sh(task_data)` | ✅ | Content for `tests/test.sh` — must write float reward to `/logs/verifier/reward.txt` |
93
+ | `dockerfile(task_data)` | ✅ | Content for `environment/Dockerfile` |
94
+ | `setup()` | optional | Download/prepare data for this dataset |
95
+ | `data_files(task_data)` | optional | Local files to COPY into the image build context |
96
+ | `artifacts()` | optional | Container paths to capture after trial |
97
+ | `verifier_env_keys()` | optional | Env var keys to forward to the SEPARATE verifier |
98
+ | `verifier_dockerfile(task_data)` | optional | Non-None triggers SEPARATE verifier mode |
99
+
100
+ ## Harbor task contract
101
+
102
+ Each generated task directory must contain:
103
+
104
+ ```
105
+ <task>/
106
+ ├── instruction.md # shown to the agent (no answer)
107
+ ├── task.toml # Harbor config (timeouts, env)
108
+ ├── environment/
109
+ │ └── Dockerfile # agent container — data files go here
110
+ └── tests/
111
+ ├── test.sh # verifier script — writes reward to /logs/verifier/reward.txt
112
+ └── Dockerfile # (SEPARATE mode only) verifier container
113
+ ```
114
+
115
+ ## Verifier modes
116
+
117
+ **SHARED** (default): verifier runs inside the agent container after the agent finishes.
118
+
119
+ **SEPARATE**: verifier runs in its own isolated container. Triggered when `handler.verifier_dockerfile()` returns non-None. Harbor re-materializes agent artifacts at their original source paths inside the verifier container — e.g. `/app/submission.csv` in the agent maps to `/app/submission.csv` in the verifier.
120
+
121
+ ## Code style
122
+
123
+ - Simple and concise over clever. Three similar lines beat a premature abstraction.
124
+ - No comments on obvious code. Only comment on non-obvious invariants or workarounds.
125
+ - No dead code. No backwards-compat shims, no unused parameters.
126
+ - Validate at system boundaries only. Trust internal code.
127
+ - Type hints on all public APIs.
128
+ - `harborforge/` must stay benchmark-agnostic. No benchmark-specific strings belong there.
129
+ - Base image: always import `BASE_IMAGE` from `adapters/<name>/base_image.py`. Never hardcode `FROM python:...` in handlers.
130
+
131
+ ## Python conventions
132
+
133
+ - Python 3.13 (package metadata pins `Requires-Python: ==3.13.*`), managed with `uv`.
134
+ - `pyproject.toml` is the source of truth for dependencies.
135
+ - Tests live in `tests/`, mirror the package structure, use `pytest`.
136
+ - Formatters: `ruff` for linting/formatting, `mypy` for types. Run `just format` before committing.
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.4
2
+ Name: harborforge
3
+ Version: 1.0.0
4
+ Requires-Python: ==3.13.*
5
+ Provides-Extra: dev
6
+ Requires-Dist: mypy>=1.10; extra == 'dev'
7
+ Requires-Dist: pytest>=8.0; extra == 'dev'
8
+ Requires-Dist: ruff>=0.8; extra == 'dev'
@@ -0,0 +1 @@
1
+ *
@@ -0,0 +1,6 @@
1
+ """harborforge — abstract toolkit for mapping benchmark datasets to Harbor task directories."""
2
+
3
+ from .handlers.base import DatasetHandler
4
+ from .mapper import DataMapper
5
+
6
+ __all__ = ["DataMapper", "DatasetHandler"]
@@ -0,0 +1,19 @@
1
+ from .base import (
2
+ DatasetHandler,
3
+ case_insensitive_verifier,
4
+ exact_match_verifier,
5
+ list_verifier,
6
+ no_verifier,
7
+ numeric_verifier,
8
+ script_verifier,
9
+ )
10
+
11
+ __all__ = [
12
+ "DatasetHandler",
13
+ "case_insensitive_verifier",
14
+ "exact_match_verifier",
15
+ "list_verifier",
16
+ "no_verifier",
17
+ "numeric_verifier",
18
+ "script_verifier",
19
+ ]
@@ -0,0 +1,231 @@
1
+ """
2
+ DatasetHandler contract.
3
+
4
+ Each dataset subclass must implement:
5
+ - instruction(task_data) → str content for instruction.md (shown to the agent, no answer leakage)
6
+ - test_sh(task_data) → str content for tests/test.sh (must write reward to /logs/verifier/reward.txt)
7
+ - dockerfile(task_data) → str content for environment/Dockerfile
8
+
9
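+ Optional overrides (non-exhaustive — setup, artifacts, verifier_env_keys and verifier_dockerfile can also be overridden):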
10
+ - data_files(task_data) → list[tuple[Path, str]] (local_path, dest_in_build_context) pairs
11
+ - task_toml(task_id) → str content for task.toml
12
+ """
13
+
14
+ from abc import ABC, abstractmethod
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+
19
+ class DatasetHandler(ABC):
20
+ dataset_name: str
21
+
22
+ @abstractmethod
23
+ def instruction(self, task_data: dict[str, Any]) -> str:
24
+ """Returns instruction.md content — what the agent sees. Must not include the answer."""
25
+
26
+ @abstractmethod
27
+ def test_sh(self, task_data: dict[str, Any]) -> str:
28
+ """Returns tests/test.sh content. Must write a float reward to /logs/verifier/reward.txt."""
29
+
30
+ @abstractmethod
31
+ def dockerfile(self, task_data: dict[str, Any]) -> str:
32
+ """Returns environment/Dockerfile content."""
33
+
34
+ def verifier_dockerfile(self, task_data: dict[str, Any]) -> str | None:
35
+ """Return a Dockerfile for a SEPARATE verifier container, or None for SHARED (default).
36
+
37
+ When non-None, Harbor routes the trial through SEPARATE mode: the verifier runs in
38
+ its own isolated container and receives agent artifacts as input. Use this when the
39
+ verifier needs different dependencies or ground truth data isolated from the agent.
40
+ """
41
+ return None
42
+
43
+ def setup(self) -> None:
44
+ """Download or prepare local data required by this dataset. No-op by default."""
45
+
46
+ def artifacts(self) -> list[str]:
47
+ """Container paths to capture as artifacts after a trial. Empty by default."""
48
+ return []
49
+
50
+ def verifier_env_keys(self) -> list[str]:
51
+ """Environment variable keys to forward to the SEPARATE verifier container."""
52
+ return []
53
+
54
+ def data_files(self, task_data: dict[str, Any]) -> list[tuple[Path, str]]:
55
+ """
56
+ Returns (local_path, dest_in_build_context) pairs for files to COPY into the image.
57
+ dest_in_build_context is relative to environment/ (e.g. 'data/train.csv').
58
+ """
59
+ return []
60
+
61
+ def task_toml(self, task_id: str) -> str:
62
+ return f"""\
63
+ version = "1.0"
64
+
65
+ [task]
66
+ name = "{task_id}"
67
+
68
+ [metadata]
69
+
70
+ [verifier]
71
+ timeout_sec = 900.0
72
+
73
+ [agent]
74
+ timeout_sec = 900.0
75
+
76
+ [environment]
77
+ build_timeout_sec = 600.0
78
+ """
79
+
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Verifier shell script templates
83
+ # ---------------------------------------------------------------------------
84
+
85
+
86
+ def exact_match_verifier(answer: Any) -> str:
87
+ """Reward 1.0 if agent output matches expected answer (whitespace-stripped string comparison)."""
88
+ safe = str(answer).replace("'", "'\\''")
89
+ return f"""\
90
+ #!/bin/bash
91
+ set -e
92
+ mkdir -p /logs/verifier
93
+
94
+ EXPECTED='{safe}'
95
+ SUBMISSION=/app/submission.txt
96
+
97
+ if [ ! -f "$SUBMISSION" ]; then
98
+ echo "0" > /logs/verifier/reward.txt
99
+ exit 0
100
+ fi
101
+
102
+ ACTUAL=$(cat "$SUBMISSION" | tr -d '[:space:]')
103
+ EXPECTED_CLEAN=$(echo "$EXPECTED" | tr -d '[:space:]')
104
+
105
+ if [ "$ACTUAL" = "$EXPECTED_CLEAN" ]; then
106
+ echo "1" > /logs/verifier/reward.txt
107
+ else
108
+ echo "0" > /logs/verifier/reward.txt
109
+ fi
110
+ """
111
+
112
+
113
+ def numeric_verifier(answer: Any, tolerance: float = 0.05) -> str:
114
+ """Reward 1.0 if agent output is within `tolerance` (relative) of expected numeric value."""
115
+ safe = str(answer).replace("'", "'\\''")
116
+ return f"""\
117
+ #!/bin/bash
118
+ set -e
119
+ mkdir -p /logs/verifier
120
+
121
+ EXPECTED='{safe}'
122
+
123
+ python3 - "$EXPECTED" <<'PYEOF'
124
+ import sys
125
+
126
+ try:
127
+ expected = float(sys.argv[1])
128
+ with open("/app/submission.txt") as f:
129
+ actual = float(f.read().strip())
130
+ denom = abs(expected) if expected != 0 else 1.0
131
+ reward = 1.0 if abs(actual - expected) / denom <= {tolerance} else 0.0
132
+ except Exception:
133
+ reward = 0.0
134
+
135
+ with open("/logs/verifier/reward.txt", "w") as f:
136
+ f.write(str(reward))
137
+ PYEOF
138
+ """
139
+
140
+
141
+ def list_verifier(answer: Any) -> str:
142
+ """Reward 1.0 if agent output contains all expected items (order-insensitive, case-insensitive).
143
+ Expected answer should be a Python list repr or comma-separated string."""
144
+ safe = str(answer).replace("'", "'\\''")
145
+ return f"""\
146
+ #!/bin/bash
147
+ set -e
148
+ mkdir -p /logs/verifier
149
+
150
+ python3 <<'PYEOF'
151
+ import ast, re
152
+
153
+ EXPECTED_RAW = '{safe}'
154
+ try:
155
+ with open("/app/submission.txt") as f:
156
+ actual_raw = f.read().strip()
157
+ except FileNotFoundError:
158
+ open("/logs/verifier/reward.txt", "w").write("0")
159
+ raise SystemExit
160
+
161
+ def parse_list(s):
162
+ try:
163
+ val = ast.literal_eval(s)
164
+ if isinstance(val, list):
165
+ return {{str(x).strip().lower() for x in val}}
166
+ except Exception:
167
+ pass
168
+ return {{x.strip().lower() for x in re.split(r"[,\\n]+", s) if x.strip()}}
169
+
170
+ expected = parse_list(EXPECTED_RAW)
171
+ actual = parse_list(actual_raw)
172
+ reward = 1.0 if expected and expected == actual else 0.0
173
+ open("/logs/verifier/reward.txt", "w").write(str(reward))
174
+ PYEOF
175
+ """
176
+
177
+
178
+ def case_insensitive_verifier(answer: Any) -> str:
179
+ """Reward 1.0 if agent output matches expected answer, ignoring case and whitespace."""
180
+ safe = str(answer).replace("'", "'\\''")
181
+ return f"""\
182
+ #!/bin/bash
183
+ set -e
184
+ mkdir -p /logs/verifier
185
+
186
+ EXPECTED='{safe}'
187
+ SUBMISSION=/app/submission.txt
188
+
189
+ if [ ! -f "$SUBMISSION" ]; then
190
+ echo "0" > /logs/verifier/reward.txt
191
+ exit 0
192
+ fi
193
+
194
+ ACTUAL=$(cat "$SUBMISSION" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]')
195
+ EXPECTED_CLEAN=$(echo "$EXPECTED" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]')
196
+
197
+ if [ "$ACTUAL" = "$EXPECTED_CLEAN" ]; then
198
+ echo "1" > /logs/verifier/reward.txt
199
+ else
200
+ echo "0" > /logs/verifier/reward.txt
201
+ fi
202
+ """
203
+
204
+
205
+ def script_verifier(script_container_path: str) -> str:
206
+ """Runs a Python eval script already present in the container.
207
+ The script is fully responsible for writing a float reward to /logs/verifier/reward.txt."""
208
+ return f"""\
209
+ #!/bin/bash
210
+ set -e
211
+ mkdir -p /logs/verifier
212
+
213
+ SCRIPT="{script_container_path}"
214
+
215
+ if [ ! -f "$SCRIPT" ]; then
216
+ echo "0" > /logs/verifier/reward.txt
217
+ exit 0
218
+ fi
219
+
220
+ python3 "$SCRIPT" || echo "0" > /logs/verifier/reward.txt
221
+ """
222
+
223
+
224
+ def no_verifier(reason: str = "External evaluation required") -> str:
225
+ """Always writes reward 0. Used for tasks that require external scoring (e.g. leaderboards)."""
226
+ return f"""\
227
+ #!/bin/bash
228
+ # {reason}
229
+ mkdir -p /logs/verifier
230
+ echo "0" > /logs/verifier/reward.txt
231
+ """
@@ -0,0 +1,142 @@
1
+ """
2
+ DataMapper — abstract base for benchmark → Harbor task directory pipelines.
3
+
4
+ Subclass and implement `iter_tasks()`. Override `setup()` to handle data
5
+ downloads or prep. Call `run()` as the standard entry point, or `map()`
6
+ directly for more control.
7
+ """
8
+
9
+ import shutil
10
+ import stat
11
+ from abc import ABC, abstractmethod
12
+ from collections.abc import Iterator
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
+ from .handlers.base import DatasetHandler
17
+
18
+
19
+ class DataMapper(ABC):
20
+ """
21
+ Abstract mapper from a benchmark dataset to Harbor task directories.
22
+
23
+ Minimal adapter implementation:
24
+ 1. Implement `iter_tasks()` to yield tasks.
25
+ 2. Optionally override `setup()` to download/prepare raw data.
26
+ 3. Call `run()` as the entry point — it handles setup, mapping, and registry.
27
+ """
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Public API
31
+ # ---------------------------------------------------------------------------
32
+
33
+ def setup(self) -> None:
34
+ """Download or prepare raw data before mapping. No-op by default."""
35
+
36
+ def map(self, output_dir: Path, registry_path: Path | None = None) -> int:
37
+ """
38
+ Write all tasks to output_dir as Harbor task directories.
39
+ Wipes and recreates output_dir on each call.
40
+ If registry_path is given, auto-generates a Harbor registry.json.
41
+ Returns the number of tasks written.
42
+ """
43
+ if output_dir.exists():
44
+ shutil.rmtree(output_dir)
45
+
46
+ total = 0
47
+ current_dataset = ""
48
+ for task_id, dir_name, handler, task_data in self.iter_tasks():
49
+ self._write_task(output_dir / dir_name, task_id, handler, task_data)
50
+ total += 1
51
+ dataset = dir_name.split("/")[0]
52
+ if dataset != current_dataset:
53
+ if current_dataset:
54
+ print()
55
+ current_dataset = dataset
56
+ print(f" ↳ {dataset}", end="", flush=True)
57
+ print(f"\r ↳ {dataset} ({total} written)", end="", flush=True)
58
+ if total:
59
+ print()
60
+
61
+ if registry_path is not None:
62
+ from harborforge.registry import generate_registry
63
+
64
+ counts = generate_registry(output_dir, registry_path)
65
+ n_datasets = len(counts)
66
+ n_tasks = sum(counts.values())
67
+ print(f"✅ {registry_path} — {n_tasks} tasks across {n_datasets} datasets")
68
+
69
+ return total
70
+
71
+ def run(self, output_dir: Path, registry_path: Path | None = None) -> None:
72
+ """
73
+ Standard entry point: setup() → map() → optional registry.
74
+ Adapters call this from __main__.py.
75
+ """
76
+ self.setup()
77
+ total = self.map(output_dir, registry_path)
78
+ print(f"📊 Total tasks written: {total}")
79
+
80
+ # ---------------------------------------------------------------------------
81
+ # Abstract
82
+ # ---------------------------------------------------------------------------
83
+
84
+ @abstractmethod
85
+ def iter_tasks(self) -> Iterator[tuple[str, str, DatasetHandler, dict[str, Any]]]:
86
+ """
87
+ Yield (task_id, dir_name, handler, raw_task_data) for each task.
88
+
89
+ - task_id: unique identifier used in task.toml name field
90
+ - dir_name: relative path under output_dir (e.g. 'daeval/0')
91
+ - handler: DatasetHandler instance for this dataset
92
+ - raw_task_data: raw dict from the source benchmark file
93
+ """
94
+
95
+ # ---------------------------------------------------------------------------
96
+ # Internal
97
+ # ---------------------------------------------------------------------------
98
+
99
+ def _write_task(
100
+ self,
101
+ task_dir: Path,
102
+ task_id: str,
103
+ handler: DatasetHandler,
104
+ task_data: dict[str, Any],
105
+ ) -> None:
106
+ task_dir.mkdir(parents=True, exist_ok=True)
107
+
108
+ (task_dir / "instruction.md").write_text(handler.instruction(task_data), encoding="utf-8")
109
+
110
+ env_dir = task_dir / "environment"
111
+ env_dir.mkdir(exist_ok=True)
112
+
113
+ for local_path, dest_name in handler.data_files(task_data):
114
+ dest = env_dir / dest_name
115
+ dest.parent.mkdir(parents=True, exist_ok=True)
116
+ shutil.copy2(local_path, dest)
117
+
118
+ (env_dir / "Dockerfile").write_text(handler.dockerfile(task_data), encoding="utf-8")
119
+
120
+ # Auto-detect verifier mode: SEPARATE if handler provides a verifier Dockerfile.
121
+ # Harbor uses tests/ as the verifier build context in SEPARATE mode.
122
+ task_toml = handler.task_toml(task_id)
123
+ verifier_df = handler.verifier_dockerfile(task_data)
124
+ if verifier_df is not None:
125
+ task_toml += "\n[verifier.environment]\nbuild_timeout_sec = 300.0\n"
126
+
127
+ (task_dir / "task.toml").write_text(task_toml, encoding="utf-8")
128
+
129
+ tests_dir = task_dir / "tests"
130
+ tests_dir.mkdir(exist_ok=True)
131
+ test_sh = tests_dir / "test.sh"
132
+ test_sh.write_text(handler.test_sh(task_data), encoding="utf-8")
133
+ test_sh.chmod(test_sh.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
134
+
135
+ # SEPARATE mode: write verifier Dockerfile into tests/ (Harbor's verifier build context).
136
+ # COPY test.sh into the image so Harbor can execute /tests/test.sh inside the container.
137
+ if verifier_df is not None:
138
+ verifier_df_with_copy = (
139
+ verifier_df.rstrip()
140
+ + "\nCOPY test.sh /tests/test.sh\nRUN chmod +x /tests/test.sh\n"
141
+ )
142
+ (tests_dir / "Dockerfile").write_text(verifier_df_with_copy, encoding="utf-8")
@@ -0,0 +1,61 @@
1
+ """
2
+ Generate a local Harbor registry.json from a directory of Harbor task directories.
3
+
4
+ The tasks directory is expected to have the structure:
5
+ tasks_dir/
6
+ dataset_a/
7
+ task_0/
8
+ task_1/
9
+ dataset_b/
10
+ task_0/
11
+
12
+ Each top-level subdirectory becomes a named dataset in the registry.
13
+ """
14
+
15
+ import json
16
+ from pathlib import Path
17
+
18
+
19
+ def generate_registry(
20
+ tasks_dir: Path,
21
+ output_path: Path,
22
+ version: str = "1.0",
23
+ description_template: str = "{name} benchmark tasks",
24
+ ) -> dict[str, int]:
25
+ """
26
+ Scan tasks_dir and write a Harbor registry.json to output_path.
27
+
28
+ Returns a dict of {dataset_name: task_count}.
29
+ """
30
+ if not tasks_dir.exists():
31
+ raise FileNotFoundError(f"Tasks directory not found: {tasks_dir}")
32
+
33
+ datasets = []
34
+ counts: dict[str, int] = {}
35
+
36
+ for dataset_dir in sorted(tasks_dir.iterdir()):
37
+ if not dataset_dir.is_dir():
38
+ continue
39
+
40
+ tasks = [
41
+ {"name": task.name, "path": str(task)}
42
+ for task in sorted(dataset_dir.iterdir())
43
+ if task.is_dir()
44
+ ]
45
+
46
+ if not tasks:
47
+ continue
48
+
49
+ name = dataset_dir.name.lower()
50
+ datasets.append(
51
+ {
52
+ "name": name,
53
+ "version": version,
54
+ "description": description_template.format(name=dataset_dir.name),
55
+ "tasks": tasks,
56
+ }
57
+ )
58
+ counts[name] = len(tasks)
59
+
60
+ output_path.write_text(json.dumps(datasets, indent=2))
61
+ return counts