if-split 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- if_split-0.1.0/.github/workflows/ci.yml +47 -0
- if_split-0.1.0/.github/workflows/publish.yml +45 -0
- if_split-0.1.0/.gitignore +44 -0
- if_split-0.1.0/.python-version +1 -0
- if_split-0.1.0/CLAUDE.md +77 -0
- if_split-0.1.0/LICENSE +21 -0
- if_split-0.1.0/PKG-INFO +312 -0
- if_split-0.1.0/PLAN.md +444 -0
- if_split-0.1.0/README.md +293 -0
- if_split-0.1.0/config/default.yaml +56 -0
- if_split-0.1.0/data/cache/.gitkeep +0 -0
- if_split-0.1.0/data/out/.gitkeep +0 -0
- if_split-0.1.0/examples/IF-Split-2026.05.31/README.md +80 -0
- if_split-0.1.0/examples/IF-Split-2026.05.31/STATS.txt +20 -0
- if_split-0.1.0/examples/IF-Split-2026.05.31/config.yaml +32 -0
- if_split-0.1.0/examples/IF-Split-2026.05.31/manifest.json +129 -0
- if_split-0.1.0/examples/IF-Split-2026.05.31/test/metal_test.json +4101 -0
- if_split-0.1.0/examples/IF-Split-2026.05.31/test/nucleotide_test.json +588 -0
- if_split-0.1.0/examples/IF-Split-2026.05.31/test/small_molecule_test.json +3532 -0
- if_split-0.1.0/examples/IF-Split-2026.05.31/test.json +12398 -0
- if_split-0.1.0/examples/IF-Split-2026.05.31/train.json +188674 -0
- if_split-0.1.0/examples/IF-Split-2026.05.31/val.json +13728 -0
- if_split-0.1.0/pyproject.toml +61 -0
- if_split-0.1.0/src/ifsplit/__init__.py +8 -0
- if_split-0.1.0/src/ifsplit/__main__.py +8 -0
- if_split-0.1.0/src/ifsplit/cli.py +317 -0
- if_split-0.1.0/src/ifsplit/cluster.py +130 -0
- if_split-0.1.0/src/ifsplit/config.py +146 -0
- if_split-0.1.0/src/ifsplit/dataset.py +112 -0
- if_split-0.1.0/src/ifsplit/download.py +229 -0
- if_split-0.1.0/src/ifsplit/enumerate.py +111 -0
- if_split-0.1.0/src/ifsplit/hydrate.py +216 -0
- if_split-0.1.0/src/ifsplit/ligands.py +267 -0
- if_split-0.1.0/src/ifsplit/manifest.py +417 -0
- if_split-0.1.0/src/ifsplit/parse.py +111 -0
- if_split-0.1.0/src/ifsplit/rcsb.py +251 -0
- if_split-0.1.0/src/ifsplit/schema.py +241 -0
- if_split-0.1.0/src/ifsplit/split.py +177 -0
- if_split-0.1.0/tests/conftest.py +248 -0
- if_split-0.1.0/tests/test_config.py +106 -0
- if_split-0.1.0/tests/test_download.py +185 -0
- if_split-0.1.0/tests/test_enumerate_lock.py +101 -0
- if_split-0.1.0/tests/test_integration.py +41 -0
- if_split-0.1.0/tests/test_ligands.py +247 -0
- if_split-0.1.0/tests/test_loader.py +142 -0
- if_split-0.1.0/tests/test_pipeline.py +371 -0
- if_split-0.1.0/tests/test_schema.py +121 -0
- if_split-0.1.0/uv.lock +924 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
|
|
12
|
+
concurrency:
|
|
13
|
+
group: ci-${{ github.ref }}
|
|
14
|
+
cancel-in-progress: true
|
|
15
|
+
|
|
16
|
+
jobs:
|
|
17
|
+
test:
|
|
18
|
+
name: lint + test (py${{ matrix.python-version }})
|
|
19
|
+
runs-on: ubuntu-latest
|
|
20
|
+
strategy:
|
|
21
|
+
fail-fast: false
|
|
22
|
+
matrix:
|
|
23
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
24
|
+
steps:
|
|
25
|
+
- uses: actions/checkout@v4
|
|
26
|
+
|
|
27
|
+
- name: Install uv
|
|
28
|
+
uses: astral-sh/setup-uv@v5
|
|
29
|
+
with:
|
|
30
|
+
enable-cache: true
|
|
31
|
+
|
|
32
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
33
|
+
run: uv python install ${{ matrix.python-version }}
|
|
34
|
+
|
|
35
|
+
- name: Sync dependencies (locked)
|
|
36
|
+
run: uv sync --locked --python ${{ matrix.python-version }}
|
|
37
|
+
|
|
38
|
+
- name: Ruff lint
|
|
39
|
+
run: uv run ruff check .
|
|
40
|
+
|
|
41
|
+
- name: Ruff format check
|
|
42
|
+
run: uv run ruff format --check .
|
|
43
|
+
|
|
44
|
+
- name: Pytest (offline)
|
|
45
|
+
run: uv run pytest -q
|
|
46
|
+
# Offline suite only; the one opt-in live RCSB test stays gated behind
|
|
47
|
+
# IFSPLIT_NETWORK_TESTS, which is intentionally not set in CI.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
# Builds the wheel/sdist and publishes to PyPI when a version tag is pushed,
|
|
4
|
+
# using PyPI Trusted Publishing (OIDC) — no API tokens or stored secrets.
|
|
5
|
+
# One-time setup on pypi.org: Account -> Publishing -> add a trusted publisher
|
|
6
|
+
# Owner: WSobo Repo: IF-Split Workflow: publish.yml Environment: pypi
|
|
7
|
+
on:
|
|
8
|
+
push:
|
|
9
|
+
tags: ["v*"]
|
|
10
|
+
workflow_dispatch:
|
|
11
|
+
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
build:
|
|
17
|
+
name: Build distributions
|
|
18
|
+
runs-on: ubuntu-latest
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
- name: Install uv
|
|
22
|
+
uses: astral-sh/setup-uv@v5
|
|
23
|
+
- name: Build sdist + wheel
|
|
24
|
+
run: uv build
|
|
25
|
+
- name: Check metadata
|
|
26
|
+
run: uvx twine check dist/*
|
|
27
|
+
- uses: actions/upload-artifact@v4
|
|
28
|
+
with:
|
|
29
|
+
name: dist
|
|
30
|
+
path: dist/
|
|
31
|
+
|
|
32
|
+
publish:
|
|
33
|
+
name: Publish to PyPI
|
|
34
|
+
needs: build
|
|
35
|
+
runs-on: ubuntu-latest
|
|
36
|
+
environment: pypi
|
|
37
|
+
permissions:
|
|
38
|
+
id-token: write # required for Trusted Publishing (OIDC)
|
|
39
|
+
steps:
|
|
40
|
+
- uses: actions/download-artifact@v4
|
|
41
|
+
with:
|
|
42
|
+
name: dist
|
|
43
|
+
path: dist/
|
|
44
|
+
- name: Publish
|
|
45
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.venv/
|
|
9
|
+
venv/
|
|
10
|
+
|
|
11
|
+
# IF-Split cache: downloaded mmCIF (large). Manifests in data/out/ are kept; dataset.lock is ignored below.
|
|
12
|
+
data/cache/*
|
|
13
|
+
!data/cache/.gitkeep
|
|
14
|
+
|
|
15
|
+
# Build artifacts. The split itself is tiny (train/val/test.json + per-class test
|
|
16
|
+
# lists + the KB-scale manifest.json) and IS committed under examples/. Only the
|
|
17
|
+
# BULKY, regenerable companions are ignored — they belong in a GitHub Release:
|
|
18
|
+
# candidates.jsonl (~335MB), ligands.tiers.json (~24MB), and the per-entry maps.
|
|
19
|
+
candidates.jsonl
|
|
20
|
+
dataset.lock
|
|
21
|
+
ligands.tiers.json
|
|
22
|
+
ligands.classes.json
|
|
23
|
+
clusters.json
|
|
24
|
+
splits.registry.json
|
|
25
|
+
splits.registry.json.tmp*
|
|
26
|
+
# ...but keep the committed example split (these live only under examples/).
|
|
27
|
+
!examples/**/manifest.json
|
|
28
|
+
!examples/**/train.json
|
|
29
|
+
!examples/**/val.json
|
|
30
|
+
!examples/**/test.json
|
|
31
|
+
!examples/**/test/*.json
|
|
32
|
+
|
|
33
|
+
# Tooling
|
|
34
|
+
.pytest_cache/
|
|
35
|
+
.mypy_cache/
|
|
36
|
+
.ruff_cache/
|
|
37
|
+
|
|
38
|
+
# Claude Code: share settings.json (project conventions), ignore local + runtime.
|
|
39
|
+
.claude/*
|
|
40
|
+
!.claude/settings.json
|
|
41
|
+
.claude/settings.local.json
|
|
42
|
+
|
|
43
|
+
# uv: commit uv.lock, ignore the managed virtualenv.
|
|
44
|
+
# (.venv is already covered above.)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.11
|
if_split-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# IF-Split — agent guide
|
|
2
|
+
|
|
3
|
+
Reproducible, date-pinned, ligand-aware train/val/test splitter for the PDB. It
|
|
4
|
+
borrows LigandMPNN's *split logic* but generates the split on demand from today's PDB.
|
|
5
|
+
Read [PLAN.md](PLAN.md) for the design rationale and [README.md](README.md) for
|
|
6
|
+
usage. This file is the orientation for working in the repo.
|
|
7
|
+
|
|
8
|
+
## The one load-bearing idea
|
|
9
|
+
|
|
10
|
+
**The split is computed from metadata + sequences only — `build` never downloads
|
|
11
|
+
structure coordinates.** Everything needed (resolution, method, release date,
|
|
12
|
+
residue counts, per-entity sequences, ligand chem-comp + bound-component signals,
|
|
13
|
+
RCSB cluster membership) comes from the RCSB Search + Data APIs. Coordinates
|
|
14
|
+
(mmCIF) are large and only needed downstream, so `fetch` (Stage 2) is optional.
|
|
15
|
+
Keep it that way: do not add coordinate access to the build path.
|
|
16
|
+
|
|
17
|
+
## Environment (WSL over a Windows UNC mount)
|
|
18
|
+
|
|
19
|
+
- Code lives in WSL at `~/projects/IF-Split`, opened over `\\wsl.localhost\ubuntu\...`.
|
|
20
|
+
- **Run everything through WSL**, not the Bash tool's git-bash (that's Windows
|
|
21
|
+
Python and lacks the deps): `wsl -d ubuntu bash -lc '...'`.
|
|
22
|
+
- The deps live in a **uv** venv; `uv` is at `$HOME/.local/bin`, so prefix:
|
|
23
|
+
`export PATH="$HOME/.local/bin:$PATH"`.
|
|
24
|
+
- Use **single quotes** for `bash -lc '...'`. Avoid backticks and `python3 -c "..."`
|
|
25
|
+
inside it — nested quotes/backticks get shell-evaluated and corrupt the command
|
|
26
|
+
(this has mangled commit messages and grep filters). For commit messages, write
|
|
27
|
+
the text to a file and use `git commit -F file`.
|
|
28
|
+
- **Do not put a command that may exit nonzero in a parallel tool batch** — one
|
|
29
|
+
failure cancels every sibling call in that batch. Run risky/dependent commands
|
|
30
|
+
one at a time.
|
|
31
|
+
|
|
32
|
+
## Commands
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
wsl -d ubuntu bash -lc 'cd ~/projects/IF-Split && export PATH="$HOME/.local/bin:$PATH" && uv run pytest -q'
|
|
36
|
+
uv run ruff check . # lint (must pass)
|
|
37
|
+
uv run ruff format . # format
|
|
38
|
+
uv run if-split build --limit 50 --out /tmp/ifs # dev build (small, live RCSB)
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
- `uv sync` sets up the env; `uv sync --extra mlops` adds pyarrow for `fetch`'s
|
|
42
|
+
parquet index. Dev tools (ruff, pytest) are a PEP 735 dependency-group.
|
|
43
|
+
- The offline test suite needs no network. One opt-in live test runs only with
|
|
44
|
+
`IFSPLIT_NETWORK_TESTS=1`.
|
|
45
|
+
|
|
46
|
+
## Architecture (src/ifsplit/, one module per stage)
|
|
47
|
+
|
|
48
|
+
`enumerate.py`+`rcsb.py` (Stage 1, Search+Data API → candidates.jsonl) →
|
|
49
|
+
`parse.py` (3, metadata filters) → `ligands.py` (4, confidence tiering) →
|
|
50
|
+
`cluster.py` (5, union-find components) → `split.py` (6, deterministic hash) →
|
|
51
|
+
`manifest.py` (7, lock + manifest + registry, verify/stats) → `dataset.py` (8,
|
|
52
|
+
loader). `download.py`+`hydrate.py` are the optional Stage 2 `fetch`.
|
|
53
|
+
|
|
54
|
+
Invariants that must not regress:
|
|
55
|
+
- **Determinism:** same config → byte-identical `manifest.json` (no wall-clock
|
|
56
|
+
fields). `test_manifest_is_deterministic` guards this.
|
|
57
|
+
- **No cross-split leakage:** sequence clusters joined by a shared multi-chain
|
|
58
|
+
entry are union-find–merged into one component; a component maps to exactly one
|
|
59
|
+
split, so overlap is impossible by construction. `check_no_leakage` is a real
|
|
60
|
+
invariant (not a tautology) — keep it that way.
|
|
61
|
+
- **Growth stability:** a cluster/component's split is `hash(salt + canonical_key)`
|
|
62
|
+
into cumulative fractions, keyed on the global-min member id (not RCSB's volatile
|
|
63
|
+
integer id). A larger snapshot only *adds* components; `splits.registry.json`
|
|
64
|
+
pins prior assignments.
|
|
65
|
+
- **Annotate, never destroy:** ligand quality is a per-component *tier*
|
|
66
|
+
(functional / ambiguous / artifact) in the manifest; structures are never
|
|
67
|
+
dropped for ligand quality. Class labels derive from the functional tier.
|
|
68
|
+
- **PDB-ID compatibility:** store entry/entity ids verbatim from `rcsb_id` (legacy
|
|
69
|
+
`4HHB` and extended `pdb_xxxxxxxx`); never slice/length-validate/case-fold them.
|
|
70
|
+
|
|
71
|
+
## Conventions
|
|
72
|
+
|
|
73
|
+
- Python ≥ 3.11, uv + ruff (line length 100). Keep ruff clean and tests green
|
|
74
|
+
before committing.
|
|
75
|
+
- Don't commit on `main` without the user asking; end commit messages with the
|
|
76
|
+
`Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>` trailer.
|
|
77
|
+
- GitHub remote: `github.com/WSobo/IF-Split` (public).
|
if_split-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 WSobo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
if_split-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: if-split
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reproducible, date-pinned, ligand-aware train/val/test splitter for the PDB (LigandMPNN-style).
|
|
5
|
+
Author: WSobo
|
|
6
|
+
License: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: dataset,ligandmpnn,mmseqs2,pdb,protein,reproducibility
|
|
9
|
+
Requires-Python: >=3.11
|
|
10
|
+
Requires-Dist: gemmi>=0.6.5
|
|
11
|
+
Requires-Dist: httpx>=0.27
|
|
12
|
+
Requires-Dist: pydantic>=2.6
|
|
13
|
+
Requires-Dist: pyyaml>=6.0
|
|
14
|
+
Provides-Extra: mlops
|
|
15
|
+
Requires-Dist: pyarrow>=15; extra == 'mlops'
|
|
16
|
+
Provides-Extra: torch
|
|
17
|
+
Requires-Dist: torch>=2.2; extra == 'torch'
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# IF-Split
|
|
21
|
+
|
|
22
|
+
**A reproducible, date-pinned, ligand-aware train/val/test splitter for the PDB.**
|
|
23
|
+
|
|
24
|
+
IF-Split borrows the *split logic* of LigandMPNN (Dauparas et al., *Nature
|
|
25
|
+
Methods* 2025) — cluster proteins at 30% sequence identity, partition so no
|
|
26
|
+
cluster spans two splits, categorize the test set by ligand class — but instead
|
|
27
|
+
of inheriting a frozen 2022 snapshot it generates the split **on demand from
|
|
28
|
+
today's PDB**, and emits a lock file so a collaborator can reproduce the exact
|
|
29
|
+
dataset later. See [PLAN.md](PLAN.md) for the full design spec.
|
|
30
|
+
|
|
31
|
+
It is built entirely on RCSB **metadata** (the Search + Data APIs): **no
|
|
32
|
+
structure coordinates are downloaded** to build a split — only tiny per-entry
|
|
33
|
+
records and sequences. Coordinates are an optional, downstream concern.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Why it's different
|
|
38
|
+
|
|
39
|
+
| | |
|
|
40
|
+
|---|---|
|
|
41
|
+
| **Fresh** | Builds from the current PDB, not a years-old frozen copy. |
|
|
42
|
+
| **Reproducible** | A `dataset.lock` pins the snapshot; `verify` re-derives it and reports any drift. |
|
|
43
|
+
| **Cheap** | Metadata-only — a split is megabytes of JSON, not a terabyte of mmCIF. |
|
|
44
|
+
| **Honest about quality** | Every ligand is tiered (`functional` / `ambiguous` / `artifact`) with a reason; nothing is silently dropped. |
|
|
45
|
+
|
|
46
|
+
### Two reproducibility guarantees
|
|
47
|
+
|
|
48
|
+
1. **Snapshot by release date, not query time.** Entries are selected by
|
|
49
|
+
`release_date <= snapshot_date`. Re-running with the same `snapshot_date`
|
|
50
|
+
yields the same candidate set no matter *when* you run it (obsoleted entries
|
|
51
|
+
are tracked, not silently dropped).
|
|
52
|
+
2. **Deterministic cluster → split assignment.** A cluster's split is decided by
|
|
53
|
+
hashing a stable cluster key into the cumulative split fractions — independent
|
|
54
|
+
of how many other clusters exist. Existing clusters never move when the PDB
|
|
55
|
+
grows, which is what prevents train/test leakage on regeneration. A
|
|
56
|
+
`splits.registry.json` pins prior assignments to make this exact even across
|
|
57
|
+
re-clustering.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Install
|
|
62
|
+
|
|
63
|
+
Requires Python ≥ 3.11 and [`uv`](https://docs.astral.sh/uv/). `build` needs
|
|
64
|
+
only network access to RCSB — no external binaries. (The optional `mmseqs2`
|
|
65
|
+
clustering backend and the optional coordinate/featurization path via `gemmi`
|
|
66
|
+
are Linux-native, so run under Linux/WSL if you use them.)
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
uv sync # creates .venv from uv.lock, installs deps + dev tools (ruff, pytest)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
`uv.lock` is committed, so environments are reproducible.
|
|
73
|
+
|
|
74
|
+
## Quickstart
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# Build the full split from today's PDB (metadata only).
|
|
78
|
+
uv run if-split build --config config/default.yaml --out data/out
|
|
79
|
+
|
|
80
|
+
# Dev: cap to the first N candidates (by sorted entry id — still reproducible).
|
|
81
|
+
uv run if-split build --limit 50 --out /tmp/ifs
|
|
82
|
+
|
|
83
|
+
# Summarize a build: split sizes, per-class test counts, curation tiers.
|
|
84
|
+
uv run if-split stats data/out/manifest.json
|
|
85
|
+
|
|
86
|
+
# Reproduce-check: re-derive from a lock and report drift vs the live PDB.
|
|
87
|
+
uv run if-split verify data/out/dataset.lock
|
|
88
|
+
|
|
89
|
+
# Growth-stable regeneration: pin prior cluster→split assignments.
|
|
90
|
+
uv run if-split build --registry data/out/splits.registry.json --out data/out2
|
|
91
|
+
|
|
92
|
+
# OPTIONAL: download the actual structures for a built split (see below).
|
|
93
|
+
uv run if-split fetch data/out/manifest.json --split test --out data/structures
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Outputs (`--out` directory)
|
|
97
|
+
|
|
98
|
+
| File | Purpose |
|
|
99
|
+
|---|---|
|
|
100
|
+
| `candidates.jsonl` | The snapshot definition — one canonical JSON record per entry. Hashed into the lock. |
|
|
101
|
+
| `dataset.lock` | Reproduction anchor: embedded config + candidates SHA-256 + entry list. |
|
|
102
|
+
| `manifest.json` | Human-facing run record: per-split entry lists, ligand classes + tiers, per-class (and ambiguous) counts, drop log, cluster/leakage stats, entry→cluster map. |
|
|
103
|
+
| `splits.registry.json` | `cluster key → split`, for growth-stable regeneration. |
|
|
104
|
+
|
|
105
|
+
## Downloading structures (`fetch`)
|
|
106
|
+
|
|
107
|
+
`build` produces a tiny, coordinate-free split. When you actually want the mmCIF
|
|
108
|
+
files — to featurize or train — `fetch` hydrates a *built manifest* into a
|
|
109
|
+
clean, ML-ready tree. It is **opt-in and downstream**: nothing about a split
|
|
110
|
+
requires coordinates.
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
# Scope is explicit by design (no accidental terabyte): choose splits or --all.
|
|
114
|
+
uv run if-split fetch data/out/manifest.json --split test # just test
|
|
115
|
+
uv run if-split fetch data/out/manifest.json --split train --split val # repeatable
|
|
116
|
+
uv run if-split fetch data/out/manifest.json --all --yes --workers 16 # everything
|
|
117
|
+
uv run if-split fetch data/out/manifest.json --all --asymmetric-unit # AU not assembly 1
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
`fetch` prints an estimated download size first and refuses pulls over ~1000
|
|
121
|
+
structures without `--yes`. It is **resumable** (existing, valid files are
|
|
122
|
+
skipped) and parallel (`--workers`).
|
|
123
|
+
|
|
124
|
+
### Layout — browsable *and* scalable
|
|
125
|
+
|
|
126
|
+
Files are split-partitioned (so you can `ls` a split) and sharded by the PDB
|
|
127
|
+
"divided" scheme — the middle two characters of the entry id — so no single
|
|
128
|
+
directory holds an unwieldy number of files:
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
data/structures/
|
|
132
|
+
structures/
|
|
133
|
+
train/ hh/4hhb-assembly1.cif.gz 01/101m-assembly1.cif.gz 02/102l-… 102m-…
|
|
134
|
+
val/ …
|
|
135
|
+
test/ 0a/10ad-assembly1.cif.gz
|
|
136
|
+
index.jsonl # one row per structure (zero-dep, greppable)
|
|
137
|
+
index.parquet # same, columnar (written if pyarrow is installed)
|
|
138
|
+
manifest.json # copy of the source split manifest
|
|
139
|
+
DATASET_CARD.md # provenance + how-to-load
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
The **index** is the ML entry point — one row per structure with `entry_id`,
|
|
143
|
+
`split`, `path`, **`sha256`** (integrity + dedupe), `cluster` (for
|
|
144
|
+
cluster-balanced batches), and `ligand_classes` / `ligand_tiers`:
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
import pandas as pd
|
|
148
|
+
df = pd.read_parquet("data/structures/index.parquet") # or read_json(..., lines=True)
|
|
149
|
+
train = df[df.split == "train"]
|
|
150
|
+
metal_train = train[train.ligand_classes.str.contains("metal")]
|
|
151
|
+
# de-redundified epoch: one structure per sequence cluster
|
|
152
|
+
epoch = train.sort_values("entry_id").groupby("cluster").head(1)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
The columnar `index.parquet` needs `pyarrow`: `uv sync --extra mlops` (the
|
|
156
|
+
zero-dependency `index.jsonl` is always written regardless).
|
|
157
|
+
|
|
158
|
+
## How it works
|
|
159
|
+
|
|
160
|
+
The pipeline has eight stages; `build` runs only the metadata ones, and none of those touch coordinates (Stage 2, `fetch`, is the opt-in exception).
|
|
161
|
+
|
|
162
|
+
| Stage | Module | What it does |
|
|
163
|
+
|---|---|---|
|
|
164
|
+
| 1 — enumerate | `enumerate.py`, `rcsb.py` | RCSB Search → entry IDs; Data API (GraphQL, batched) → sequences, ligands, residue counts, cluster membership → `candidates.jsonl`. |
|
|
165
|
+
| 3 — filter | `parse.py` | Drop no-protein / no-sequence / oversized entries (assembly-1 residue count vs `max_total_residues`), plus optional wwPDB validation-report quality caps (clashscore, R-free, Ramachandran/rotamer/RSRZ) — all from metadata. Every drop is logged with its reason. |
|
|
166
|
+
| 4 — ligands | `ligands.py` | Tier each non-protein component `functional`/`ambiguous`/`artifact`; derive class labels (metal / small-molecule / nucleotide). Nucleotide is functional only with a verified protein↔NA assembly interface. **Annotate, never drop.** |
|
|
167
|
+
| 5 — cluster | `cluster.py` | Group protein entities by RCSB precomputed cluster id at `identity_threshold`; canonical key = smallest member id. |
|
|
168
|
+
| 6 — split | `split.py` | Deterministic hash → train/val/test; assert no cluster spans two splits; audit residual secondary-chain overlap. |
|
|
169
|
+
| 7 — manifest | `manifest.py` | Emit lock + manifest + registry (all deterministic, no wall-clock fields). |
|
|
170
|
+
| 8 — loader | `dataset.py` | Read a manifest into train/val/test views with cluster-balanced sampling. |
|
|
171
|
+
| 2 — fetch *(opt-in)* | `download.py`, `hydrate.py` | Download mmCIF for a built manifest into a sharded, indexed, ML-ready tree. |
|
|
172
|
+
|
|
173
|
+
> Stage 2 (mmCIF coordinate download) is **optional and downstream** — only
|
|
174
|
+
> needed to extract ligand context or feed a model, never to build a split. See
|
|
175
|
+
> [Downloading structures](#downloading-structures-fetch) for the `fetch` command.
|
|
176
|
+
|
|
177
|
+
### Structure quality (validation report)
|
|
178
|
+
|
|
179
|
+
For the highest-quality backbones, `build` can filter on the **wwPDB validation
|
|
180
|
+
report** — fetched as metadata, so the no-download invariant still holds. The
|
|
181
|
+
metrics come straight from the deposited report:
|
|
182
|
+
|
|
183
|
+
| Cap | Metric | Applies to |
|
|
184
|
+
|---|---|---|
|
|
185
|
+
| `max_clashscore` | all-atom clashscore | X-ray + cryo-EM |
|
|
186
|
+
| `max_ramachandran_outlier_pct` | % backbone Ramachandran outliers | X-ray + cryo-EM |
|
|
187
|
+
| `max_rotamer_outlier_pct` | % sidechain rotamer outliers | X-ray + cryo-EM |
|
|
188
|
+
| `max_rfree` | R-free (DCC) | X-ray |
|
|
189
|
+
| `max_rsrz_outlier_pct` | % real-space-R Z-score outliers | X-ray |
|
|
190
|
+
|
|
191
|
+
Two rules keep it honest: a cap fires **only when the metric is present**, so a
|
|
192
|
+
cryo-EM entry is never dropped for a missing R-free; and every cap is **off by
|
|
193
|
+
default**, so the snapshot is unchanged until you opt in. `require_validation_report`
|
|
194
|
+
drops entries with no report at all. Each drop is logged with its reason and
|
|
195
|
+
value (e.g. `clashscore_too_high`) and is summarised by `if-split stats`.
|
|
196
|
+
|
|
197
|
+
> Strict starting point: `max_clashscore: 40`, `max_rfree: 0.30`,
|
|
198
|
+
> `max_ramachandran_outlier_pct: 1.0`. Some classic low-quality depositions drop
|
|
199
|
+
> out — e.g. the 1984 entry `4HHB` has a clashscore of 142.
|
|
200
|
+
|
|
201
|
+
### Ligand quality: annotate, don't destroy
|
|
202
|
+
|
|
203
|
+
IF-Split is a *tool*, not one frozen dataset, so it won't make an irreversible
|
|
204
|
+
quality call for you. Every non-protein component is tiered, with a
|
|
205
|
+
machine-readable reason, from RCSB metadata signals:
|
|
206
|
+
|
|
207
|
+
| Tier | Meaning | Example reasons |
|
|
208
|
+
|---|---|---|
|
|
209
|
+
| `functional` | Real ligand/site → gets a class label | bound to protein (`nonpolymer_bound_components`) or has measured binding affinity |
|
|
210
|
+
| `ambiguous` | Present but uncorroborated → reported, **not** labelled | `metal_unbound`, `ligand_unbound` |
|
|
211
|
+
| `artifact` | Buffer / counterion / purification tag → excluded from labels | `additive`, `counterion`, `histag_metal` |
|
|
212
|
+
|
|
213
|
+
**Holo gating (metadata-only).** Presence isn't enough. A small molecule or metal
|
|
214
|
+
is `functional` only if RCSB reports it *contacting* the protein (`bound_components`)
|
|
215
|
+
or it has a measured binding affinity; an unbound one is `ambiguous`. A DNA/RNA
|
|
216
|
+
chain is `functional` *nucleotide* only when the biological assembly has a verified
|
|
217
|
+
protein↔nucleic-acid interface (`rcsb_interface_info.polymer_composition == "Protein/NA"`)
|
|
218
|
+
— a co-deposited but non-contacting oligo is reported `ambiguous`, never silently
|
|
219
|
+
labelled. (Interfaces are RCSB-computed metadata, available for X-ray *and* cryo-EM,
|
|
220
|
+
so no coordinates are downloaded.)
|
|
221
|
+
|
|
222
|
+
The His-tag/Ni curation catches a known blemish in the LigandMPNN metal set:
|
|
223
|
+
structures whose only "metal site" is a poly-His tag chelating Ni/Co from
|
|
224
|
+
affinity purification. Live examples from a real build: `101M → {HEM:
|
|
225
|
+
functional, SO4: artifact}`, `102L → {BME: artifact, CL: artifact}`.
|
|
226
|
+
|
|
227
|
+
Crucially, **the structure always stays in its split** — a protein with a junk
|
|
228
|
+
ion is still a good backbone; we just don't label the junk. A consumer wanting
|
|
229
|
+
"pristine metal sites only" vs "maximum scale, I'll filter myself" changes a
|
|
230
|
+
threshold, not the build. The same per-component tier is what a downstream
|
|
231
|
+
featurizer reads to decide what counts as real ligand context.
|
|
232
|
+
|
|
233
|
+
### Test-set representation
|
|
234
|
+
|
|
235
|
+
The split is a pure deterministic hash, so the test set's ligand mix is reported
|
|
236
|
+
but not forced by default: `manifest.json` carries per-split, per-class
|
|
237
|
+
`functional` counts plus `ambiguous` counts, so under-representation is visible.
|
|
238
|
+
An opt-in `--enforce-minimums` top-up (recruit `functional`-only ligand clusters
|
|
239
|
+
into test in deterministic order) is scoped for a future release.
|
|
240
|
+
|
|
241
|
+
### Using a split (loader)
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
from ifsplit.dataset import load_dataset
|
|
245
|
+
|
|
246
|
+
ds = load_dataset("data/out/manifest.json")
|
|
247
|
+
print(len(ds.train), len(ds.val), len(ds.test))
|
|
248
|
+
|
|
249
|
+
# Ligand-class views.
|
|
250
|
+
metal_test = ds.test.with_class("metal")
|
|
251
|
+
|
|
252
|
+
# Cluster-balanced sampling: one representative per sequence cluster per epoch,
|
|
253
|
+
# so over-represented folds (lysozyme, common kinases) don't dominate.
|
|
254
|
+
for epoch in range(3):
|
|
255
|
+
batch_ids = ds.train.sample_by_cluster(seed=epoch)
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
## Configuration
|
|
259
|
+
|
|
260
|
+
Everything that affects the output lives in one YAML file
|
|
261
|
+
([`config/default.yaml`](config/default.yaml)); its canonical hash is embedded
|
|
262
|
+
in every manifest, so two builds with the same hash used identical settings.
|
|
263
|
+
|
|
264
|
+
| Key | Default | Meaning |
|
|
265
|
+
|---|---|---|
|
|
266
|
+
| `snapshot_date` | `2026-05-30` | `release_date <= this` — the reproducibility anchor. |
|
|
267
|
+
| `experimental_methods` | X-ray, EM | Allowed `exptl.method` values. |
|
|
268
|
+
| `resolution_max_A` | `3.5` | Resolution cutoff. |
|
|
269
|
+
| `max_total_residues` | `5999` | Size cap (LigandMPNN used `< 6000`). |
|
|
270
|
+
| `excluded_het` | waters + common ions | Extra components forced to `artifact`. |
|
|
271
|
+
| `use_biological_assembly` | `true` | Count residues from assembly 1, not the deposited asymmetric unit. |
|
|
272
|
+
| `purification_metals` | `[NI, CO]` | Metals treated as IMAC tags; `[]` disables the heuristic. |
|
|
273
|
+
| `histag_min_run` | `6` | His-run length that marks a purification tag. |
|
|
274
|
+
| `exclude_purification_artifacts` | `true` | Demote His-tag metals to `artifact`. |
|
|
275
|
+
| `identity_threshold` | `0.30` | Clustering cutoff (RCSB levels: 30/50/70/90/95/100). |
|
|
276
|
+
| `clustering_backend` | `precomputed` | `precomputed` (RCSB clusters) or `mmseqs2` (run your own). |
|
|
277
|
+
| `split_fractions` | 0.80 / 0.10 / 0.10 | train / val / test. |
|
|
278
|
+
| `split_salt` | `snapsplit-v1` | Bump to intentionally reshuffle the split. |
|
|
279
|
+
| `max_clashscore`, `max_rfree`, `max_ramachandran_outlier_pct`, `max_rotamer_outlier_pct`, `max_rsrz_outlier_pct`, `require_validation_report` | off | Optional validation-report quality caps — see [Structure quality](#structure-quality-validation-report). |
|
|
280
|
+
| `ligand_context_radius_A`, `max_ligand_atoms` | `8.0`, `25` | Featurization only (not part of the split). |
|
|
281
|
+
|
|
282
|
+
## Develop
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
uv run pytest # tests (offline; 1 opt-in network test, see below)
|
|
286
|
+
uv run ruff check . # lint
|
|
287
|
+
uv run ruff format . # format
|
|
288
|
+
|
|
289
|
+
# Run the opt-in live RCSB round-trip test.
|
|
290
|
+
IFSPLIT_NETWORK_TESTS=1 uv run pytest tests/test_integration.py
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
## Layout
|
|
294
|
+
|
|
295
|
+
```
|
|
296
|
+
config/default.yaml # single source of truth for a run (hashed into the manifest)
|
|
297
|
+
src/ifsplit/ # config.py + one module per pipeline stage
|
|
298
|
+
enumerate.py rcsb.py # Stage 1: RCSB Search + Data API
|
|
299
|
+
parse.py # Stage 3: metadata filters
|
|
300
|
+
ligands.py # Stage 4: ligand tiering + classification
|
|
301
|
+
cluster.py split.py # Stages 5-6: clustering + deterministic split
|
|
302
|
+
manifest.py # Stage 7: lock + manifest + registry, verify/stats
|
|
303
|
+
dataset.py # Stage 8: loader + cluster-balanced sampling
|
|
304
|
+
download.py hydrate.py # Stage 2: optional mmCIF fetch (featurization only)
|
|
305
|
+
data/cache/ # downloaded mmCIF, if ever used (gitignored)
|
|
306
|
+
data/out/ # generated manifests + lock files
|
|
307
|
+
tests/
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
## License
|
|
311
|
+
|
|
312
|
+
MIT — see [LICENSE](LICENSE).
|