if-split 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {if_split-0.1.0 → if_split-0.2.0}/PKG-INFO +87 -15
- {if_split-0.1.0 → if_split-0.2.0}/README.md +86 -14
- {if_split-0.1.0 → if_split-0.2.0}/config/default.yaml +12 -4
- {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/README.md +19 -15
- if_split-0.2.0/examples/IF-Split-2026.05.31/STATS.txt +20 -0
- {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/manifest.json +35 -32
- {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/test/metal_test.json +16 -103
- {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/test/small_molecule_test.json +3183 -0
- {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/test.json +13 -0
- {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/train.json +14 -24
- {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/val.json +8 -14
- {if_split-0.1.0 → if_split-0.2.0}/pyproject.toml +1 -1
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/__init__.py +1 -1
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/cli.py +48 -0
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/config.py +71 -4
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/ligands.py +99 -23
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/manifest.py +1 -1
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/rcsb.py +8 -1
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/schema.py +13 -1
- {if_split-0.1.0 → if_split-0.2.0}/tests/test_config.py +51 -0
- {if_split-0.1.0 → if_split-0.2.0}/tests/test_ligands.py +112 -17
- {if_split-0.1.0 → if_split-0.2.0}/uv.lock +1 -1
- if_split-0.1.0/examples/IF-Split-2026.05.31/STATS.txt +0 -20
- {if_split-0.1.0 → if_split-0.2.0}/.github/workflows/ci.yml +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/.github/workflows/publish.yml +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/.gitignore +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/.python-version +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/CLAUDE.md +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/LICENSE +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/PLAN.md +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/data/cache/.gitkeep +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/data/out/.gitkeep +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/config.yaml +0 -0
- /if_split-0.1.0/examples/IF-Split-2026.05.31/test/nucleotide_test.json → /if_split-0.2.0/examples/IF-Split-2026.05.31/test/nucleic_acid_test.json +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/__main__.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/cluster.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/dataset.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/download.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/enumerate.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/hydrate.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/parse.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/split.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/tests/conftest.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/tests/test_download.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/tests/test_enumerate_lock.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/tests/test_integration.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/tests/test_loader.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/tests/test_pipeline.py +0 -0
- {if_split-0.1.0 → if_split-0.2.0}/tests/test_schema.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: if-split
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Reproducible, date-pinned, ligand-aware train/val/test splitter for the PDB (LigandMPNN-style).
|
|
5
5
|
Author: WSobo
|
|
6
6
|
License: MIT
|
|
@@ -19,6 +19,10 @@ Description-Content-Type: text/markdown
|
|
|
19
19
|
|
|
20
20
|
# IF-Split
|
|
21
21
|
|
|
22
|
+
[![CI](https://github.com/WSobo/IF-Split/actions/workflows/ci.yml/badge.svg)](https://github.com/WSobo/IF-Split/actions/workflows/ci.yml)
|
|
23
|
+
[![PyPI version](https://img.shields.io/pypi/v/if-split.svg)](https://pypi.org/project/if-split/)
|
|
24
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/if-split.svg)](https://pypi.org/project/if-split/)
|
|
25
|
+
|
|
22
26
|
**A reproducible, date-pinned, ligand-aware train/val/test splitter for the PDB.**
|
|
23
27
|
|
|
24
28
|
IF-Split borrows the *split logic* of LigandMPNN (Dauparas et al., *Nature
|
|
@@ -60,16 +64,23 @@ records and sequences. Coordinates are an optional, downstream concern.
|
|
|
60
64
|
|
|
61
65
|
## Install
|
|
62
66
|
|
|
63
|
-
Requires Python ≥ 3.11
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
+
Requires Python ≥ 3.11. `build` needs only network access to RCSB — no external
|
|
68
|
+
binaries.
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install if-split # from PyPI
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Or for development, with [`uv`](https://docs.astral.sh/uv/):
|
|
67
75
|
|
|
68
76
|
```bash
|
|
77
|
+
git clone https://github.com/WSobo/IF-Split && cd IF-Split
|
|
69
78
|
uv sync # creates .venv from uv.lock, installs deps + dev tools (ruff, pytest)
|
|
70
79
|
```
|
|
71
80
|
|
|
72
|
-
`uv.lock` is committed, so environments are reproducible.
|
|
81
|
+
`uv.lock` is committed, so dev environments are reproducible. (The optional
|
|
82
|
+
coordinate/featurization path via `gemmi` is Linux-native, so run under
|
|
83
|
+
Linux/WSL if you use `fetch`.)
|
|
73
84
|
|
|
74
85
|
## Quickstart
|
|
75
86
|
|
|
@@ -91,6 +102,9 @@ uv run if-split build --registry data/out/splits.registry.json --out data/out2
|
|
|
91
102
|
|
|
92
103
|
# OPTIONAL: download the actual structures for a built split (see below).
|
|
93
104
|
uv run if-split fetch data/out/manifest.json --split test --out data/structures
|
|
105
|
+
|
|
106
|
+
# Emit a portable, shareable split spec (see "Sharing a split spec" below).
|
|
107
|
+
uv run if-split spec data/out/manifest.json --name my-split --out my-split.ifsplit.yaml
|
|
94
108
|
```
|
|
95
109
|
|
|
96
110
|
### Outputs (`--out` directory)
|
|
@@ -163,7 +177,7 @@ A `build` runs eight stages; none touch coordinates.
|
|
|
163
177
|
|---|---|---|
|
|
164
178
|
| 1 — enumerate | `enumerate.py`, `rcsb.py` | RCSB Search → entry IDs; Data API (GraphQL, batched) → sequences, ligands, residue counts, cluster membership → `candidates.jsonl`. |
|
|
165
179
|
| 3 — filter | `parse.py` | Drop no-protein / no-sequence / oversized entries (assembly-1 residue count vs `max_total_residues`), plus optional wwPDB validation-report quality caps (clashscore, R-free, Ramachandran/rotamer/RSRZ) — all from metadata. Every drop is logged with its reason. |
|
|
166
|
-
| 4 — ligands | `ligands.py` | Tier each non-protein component `functional`/`ambiguous`/`artifact`; derive class labels (metal / small-molecule / nucleotide). **Annotate, never drop.** |
|
|
180
|
+
| 4 — ligands | `ligands.py` | Tier each non-protein component `functional`/`ambiguous`/`artifact`; derive class labels (metal / small-molecule / nucleic-acid). `nucleic_acid` = a protein↔DNA/RNA *complex* (verified assembly interface), **not** a bound mononucleotide. **Annotate, never drop.** |
|
|
167
181
|
| 5 — cluster | `cluster.py` | Group protein entities by RCSB precomputed cluster id at `identity_threshold`; canonical key = smallest member id. |
|
|
168
182
|
| 6 — split | `split.py` | Deterministic hash → train/val/test; assert no cluster spans two splits; audit residual secondary-chain overlap. |
|
|
169
183
|
| 7 — manifest | `manifest.py` | Emit lock + manifest + registry (all deterministic, no wall-clock fields). |
|
|
@@ -213,16 +227,30 @@ machine-readable reason, from RCSB metadata signals:
|
|
|
213
227
|
**Holo gating (metadata-only).** Presence isn't enough. A small molecule or metal
|
|
214
228
|
is `functional` only if RCSB reports it *contacting* the protein (`bound_components`)
|
|
215
229
|
or it has a measured binding affinity; an unbound one is `ambiguous`. A DNA/RNA
|
|
216
|
-
chain is `functional`
|
|
217
|
-
protein↔nucleic-acid interface (`
|
|
218
|
-
|
|
230
|
+
chain is tiered `functional` (class `nucleic_acid`) only when the biological assembly has a verified
|
|
231
|
+
protein↔nucleic-acid interface (`num_prot_na_interface_entities > 0`) — a
|
|
232
|
+
co-deposited but non-contacting oligo is reported `ambiguous`, never silently
|
|
219
233
|
labelled. (Interfaces are RCSB-computed metadata, available for X-ray *and* cryo-EM,
|
|
220
234
|
so no coordinates are downloaded.)
|
|
221
235
|
|
|
236
|
+
> The `nucleic_acid` class is the protein–nucleic-acid **complex** category (DNA/RNA
|
|
237
|
+
> polymer chains), matching LigandMPNN's "nucleotide" split. Bound *mononucleotide*
|
|
238
|
+
> ligands (ATP, GTP, NAD, SAM, …) are not this class — they fall under
|
|
239
|
+
> `small_molecule`.
|
|
240
|
+
|
|
222
241
|
The His-tag/Ni curation catches a known blemish in the LigandMPNN metal set:
|
|
223
242
|
structures whose only "metal site" is a poly-His tag chelating Ni/Co from
|
|
224
|
-
affinity purification.
|
|
225
|
-
|
|
243
|
+
affinity purification. A poly-His run anywhere — or a short run at a chain
|
|
244
|
+
terminus (`histag_terminal_min_run`, catching 6×His tags left partial by
|
|
245
|
+
unmodeled or trimmed residues) — flags the entry's Ni/Co as an `artifact`.
|
|
246
|
+
|
|
247
|
+
But a real audit showed the deeper issue: **~96% of lone Ni/Co entries have no
|
|
248
|
+
His-tag in the deposited sequence at all** (the tag is trimmed from the SEQRES
|
|
249
|
+
record, not just unmodeled), so a sequence scan can never see it. Therefore, even with no
|
|
250
|
+
detectable tag, a *lone* Ni/Co (the entry's only metal) with no measured affinity
|
|
251
|
+
is demoted from `functional` to `ambiguous` — reported, not labelled. Real metals
|
|
252
|
+
(Zn, Mg, Fe, …), and Ni/Co backed by an affinity or sitting alongside a genuine
|
|
253
|
+
metal, are untouched. On the full PDB this re-tiers ~2.7% of the metal set.
|
|
226
254
|
|
|
227
255
|
Crucially, **the structure always stays in its split** — a protein with a junk
|
|
228
256
|
ion is still a good backbone; we just don't label the junk. A consumer wanting
|
|
@@ -255,11 +283,54 @@ for epoch in range(3):
|
|
|
255
283
|
batch_ids = ds.train.sample_by_cluster(seed=epoch)
|
|
256
284
|
```
|
|
257
285
|
|
|
286
|
+
## Sharing a split spec
|
|
287
|
+
|
|
288
|
+
The config **is** the shareable recipe. Everything that affects the split lives in
|
|
289
|
+
one small YAML file with a content hash, so you can hand someone that file and they
|
|
290
|
+
reproduce your methodology exactly — like `params.yaml` in DVC. `if-split spec`
|
|
291
|
+
emits a portable, self-identifying version from any build or config:
|
|
292
|
+
|
|
293
|
+
```bash
|
|
294
|
+
# Extract a stand-alone spec from a finished build (config is embedded in the manifest):
|
|
295
|
+
uv run if-split spec data/out/manifest.json --name "my-split" --author "you" \
|
|
296
|
+
--out my-split.ifsplit.yaml
|
|
297
|
+
|
|
298
|
+
# Anyone reproduces your split from just that file:
|
|
299
|
+
uv run if-split build --config my-split.ifsplit.yaml --out their/out
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
The emitted file carries a `spec:` header that announces what it is and pins the
|
|
303
|
+
expected hash:
|
|
304
|
+
|
|
305
|
+
```yaml
|
|
306
|
+
spec:
|
|
307
|
+
ifsplit_spec: ifsplit/config@1 # schema id — the file says what it is
|
|
308
|
+
name: my-split
|
|
309
|
+
author: you
|
|
310
|
+
created_with: if-split 0.1.0
|
|
311
|
+
expected_config_hash: 3b63318286fd2ac4994f34d10936be05
|
|
312
|
+
snapshot_date: '2026-05-31'
|
|
313
|
+
resolution_max_A: 3.5
|
|
314
|
+
# ... all output-affecting settings ...
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
On load, if `expected_config_hash` no longer matches the settings (someone edited
|
|
318
|
+
them after stamping), IF-Split warns. The `spec:` metadata is **excluded from the
|
|
319
|
+
hash**, so name/author/description never change the split identity — two specs that
|
|
320
|
+
differ only in their labels produce byte-identical outputs.
|
|
321
|
+
|
|
322
|
+
| Artifact | Question it answers | Size |
|
|
323
|
+
|---|---|--:|
|
|
324
|
+
| `*.ifsplit.yaml` (or `config.yaml`) | *"How did you make this split?"* — the recipe | ~KB |
|
|
325
|
+
| `manifest.json` | *"What's in it?"* — counts, provenance, file index | ~KB |
|
|
326
|
+
| `dataset.lock` | *"Reproduce the exact bytes"* — pins entry set + candidates SHA | ~MB |
|
|
327
|
+
|
|
258
328
|
## Configuration
|
|
259
329
|
|
|
260
330
|
Everything that affects the output lives in one YAML file
|
|
261
331
|
([`config/default.yaml`](config/default.yaml)); its canonical hash is embedded
|
|
262
|
-
in every manifest, so two builds with the same hash used identical settings.
|
|
332
|
+
in every manifest, so two builds with the same hash used identical settings. It
|
|
333
|
+
doubles as a shareable **split spec** — see [Sharing a split spec](#sharing-a-split-spec).
|
|
263
334
|
|
|
264
335
|
| Key | Default | Meaning |
|
|
265
336
|
|---|---|---|
|
|
@@ -270,8 +341,9 @@ in every manifest, so two builds with the same hash used identical settings.
|
|
|
270
341
|
| `excluded_het` | waters + common ions | Extra components forced to `artifact`. |
|
|
271
342
|
| `use_biological_assembly` | `true` | Count residues from assembly 1, not the deposited asymmetric unit. |
|
|
272
343
|
| `purification_metals` | `[NI, CO]` | Metals treated as IMAC purification artifacts (the ions a His-tag chelates); `[]` disables the heuristic. |
|
|
273
|
-
| `histag_min_run` | `6` | His-run length that marks a purification tag. |
|
|
274
|
-
| `
|
|
344
|
+
| `histag_min_run` | `6` | His-run length (anywhere) that marks a purification tag. |
|
|
345
|
+
| `histag_terminal_min_run` | `3` | Shorter His-run at a chain terminus that also counts as a tag (partial/unmodeled 6×His). |
|
|
346
|
+
| `exclude_purification_artifacts` | `true` | Demote His-tag metals to `artifact`; lone uncorroborated Ni/Co → `ambiguous`. |
|
|
275
347
|
| `identity_threshold` | `0.30` | Clustering cutoff (RCSB levels: 30/50/70/90/95/100). |
|
|
276
348
|
| `clustering_backend` | `precomputed` | `precomputed` (RCSB clusters) or `mmseqs2` (run your own). |
|
|
277
349
|
| `split_fractions` | 0.80 / 0.10 / 0.10 | train / val / test. |
|
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
# IF-Split
|
|
2
2
|
|
|
3
|
+
[![CI](https://github.com/WSobo/IF-Split/actions/workflows/ci.yml/badge.svg)](https://github.com/WSobo/IF-Split/actions/workflows/ci.yml)
|
|
4
|
+
[![PyPI version](https://img.shields.io/pypi/v/if-split.svg)](https://pypi.org/project/if-split/)
|
|
5
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/if-split.svg)](https://pypi.org/project/if-split/)
|
|
6
|
+
|
|
3
7
|
**A reproducible, date-pinned, ligand-aware train/val/test splitter for the PDB.**
|
|
4
8
|
|
|
5
9
|
IF-Split borrows the *split logic* of LigandMPNN (Dauparas et al., *Nature
|
|
@@ -41,16 +45,23 @@ records and sequences. Coordinates are an optional, downstream concern.
|
|
|
41
45
|
|
|
42
46
|
## Install
|
|
43
47
|
|
|
44
|
-
Requires Python ≥ 3.11
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
+
Requires Python ≥ 3.11. `build` needs only network access to RCSB — no external
|
|
49
|
+
binaries.
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install if-split # from PyPI
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Or for development, with [`uv`](https://docs.astral.sh/uv/):
|
|
48
56
|
|
|
49
57
|
```bash
|
|
58
|
+
git clone https://github.com/WSobo/IF-Split && cd IF-Split
|
|
50
59
|
uv sync # creates .venv from uv.lock, installs deps + dev tools (ruff, pytest)
|
|
51
60
|
```
|
|
52
61
|
|
|
53
|
-
`uv.lock` is committed, so environments are reproducible.
|
|
62
|
+
`uv.lock` is committed, so dev environments are reproducible. (The optional
|
|
63
|
+
coordinate/featurization path via `gemmi` is Linux-native, so run under
|
|
64
|
+
Linux/WSL if you use `fetch`.)
|
|
54
65
|
|
|
55
66
|
## Quickstart
|
|
56
67
|
|
|
@@ -72,6 +83,9 @@ uv run if-split build --registry data/out/splits.registry.json --out data/out2
|
|
|
72
83
|
|
|
73
84
|
# OPTIONAL: download the actual structures for a built split (see below).
|
|
74
85
|
uv run if-split fetch data/out/manifest.json --split test --out data/structures
|
|
86
|
+
|
|
87
|
+
# Emit a portable, shareable split spec (see "Sharing a split spec" below).
|
|
88
|
+
uv run if-split spec data/out/manifest.json --name my-split --out my-split.ifsplit.yaml
|
|
75
89
|
```
|
|
76
90
|
|
|
77
91
|
### Outputs (`--out` directory)
|
|
@@ -144,7 +158,7 @@ A `build` runs eight stages; none touch coordinates.
|
|
|
144
158
|
|---|---|---|
|
|
145
159
|
| 1 — enumerate | `enumerate.py`, `rcsb.py` | RCSB Search → entry IDs; Data API (GraphQL, batched) → sequences, ligands, residue counts, cluster membership → `candidates.jsonl`. |
|
|
146
160
|
| 3 — filter | `parse.py` | Drop no-protein / no-sequence / oversized entries (assembly-1 residue count vs `max_total_residues`), plus optional wwPDB validation-report quality caps (clashscore, R-free, Ramachandran/rotamer/RSRZ) — all from metadata. Every drop is logged with its reason. |
|
|
147
|
-
| 4 — ligands | `ligands.py` | Tier each non-protein component `functional`/`ambiguous`/`artifact`; derive class labels (metal / small-molecule / nucleotide). **Annotate, never drop.** |
|
|
161
|
+
| 4 — ligands | `ligands.py` | Tier each non-protein component `functional`/`ambiguous`/`artifact`; derive class labels (metal / small-molecule / nucleic-acid). `nucleic_acid` = a protein↔DNA/RNA *complex* (verified assembly interface), **not** a bound mononucleotide. **Annotate, never drop.** |
|
|
148
162
|
| 5 — cluster | `cluster.py` | Group protein entities by RCSB precomputed cluster id at `identity_threshold`; canonical key = smallest member id. |
|
|
149
163
|
| 6 — split | `split.py` | Deterministic hash → train/val/test; assert no cluster spans two splits; audit residual secondary-chain overlap. |
|
|
150
164
|
| 7 — manifest | `manifest.py` | Emit lock + manifest + registry (all deterministic, no wall-clock fields). |
|
|
@@ -194,16 +208,30 @@ machine-readable reason, from RCSB metadata signals:
|
|
|
194
208
|
**Holo gating (metadata-only).** Presence isn't enough. A small molecule or metal
|
|
195
209
|
is `functional` only if RCSB reports it *contacting* the protein (`bound_components`)
|
|
196
210
|
or it has a measured binding affinity; an unbound one is `ambiguous`. A DNA/RNA
|
|
197
|
-
chain is `functional`
|
|
198
|
-
protein↔nucleic-acid interface (`
|
|
199
|
-
|
|
211
|
+
chain is tiered `functional` (class `nucleic_acid`) only when the biological assembly has a verified
|
|
212
|
+
protein↔nucleic-acid interface (`num_prot_na_interface_entities > 0`) — a
|
|
213
|
+
co-deposited but non-contacting oligo is reported `ambiguous`, never silently
|
|
200
214
|
labelled. (Interfaces are RCSB-computed metadata, available for X-ray *and* cryo-EM,
|
|
201
215
|
so no coordinates are downloaded.)
|
|
202
216
|
|
|
217
|
+
> The `nucleic_acid` class is the protein–nucleic-acid **complex** category (DNA/RNA
|
|
218
|
+
> polymer chains), matching LigandMPNN's "nucleotide" split. Bound *mononucleotide*
|
|
219
|
+
> ligands (ATP, GTP, NAD, SAM, …) are not this class — they fall under
|
|
220
|
+
> `small_molecule`.
|
|
221
|
+
|
|
203
222
|
The His-tag/Ni curation catches a known blemish in the LigandMPNN metal set:
|
|
204
223
|
structures whose only "metal site" is a poly-His tag chelating Ni/Co from
|
|
205
|
-
affinity purification.
|
|
206
|
-
|
|
224
|
+
affinity purification. A poly-His run anywhere — or a short run at a chain
|
|
225
|
+
terminus (`histag_terminal_min_run`, catching 6×His tags left partial by
|
|
226
|
+
unmodeled or trimmed residues) — flags the entry's Ni/Co as an `artifact`.
|
|
227
|
+
|
|
228
|
+
But a real audit showed the deeper issue: **~96% of lone Ni/Co entries have no
|
|
229
|
+
His-tag in the deposited sequence at all** (the tag is trimmed from the SEQRES
|
|
230
|
+
record, not just unmodeled), so a sequence scan can never see it. Therefore, even with no
|
|
231
|
+
detectable tag, a *lone* Ni/Co (the entry's only metal) with no measured affinity
|
|
232
|
+
is demoted from `functional` to `ambiguous` — reported, not labelled. Real metals
|
|
233
|
+
(Zn, Mg, Fe, …), and Ni/Co backed by an affinity or sitting alongside a genuine
|
|
234
|
+
metal, are untouched. On the full PDB this re-tiers ~2.7% of the metal set.
|
|
207
235
|
|
|
208
236
|
Crucially, **the structure always stays in its split** — a protein with a junk
|
|
209
237
|
ion is still a good backbone; we just don't label the junk. A consumer wanting
|
|
@@ -236,11 +264,54 @@ for epoch in range(3):
|
|
|
236
264
|
batch_ids = ds.train.sample_by_cluster(seed=epoch)
|
|
237
265
|
```
|
|
238
266
|
|
|
267
|
+
## Sharing a split spec
|
|
268
|
+
|
|
269
|
+
The config **is** the shareable recipe. Everything that affects the split lives in
|
|
270
|
+
one small YAML file with a content hash, so you can hand someone that file and they
|
|
271
|
+
reproduce your methodology exactly — like `params.yaml` in DVC. `if-split spec`
|
|
272
|
+
emits a portable, self-identifying version from any build or config:
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
# Extract a stand-alone spec from a finished build (config is embedded in the manifest):
|
|
276
|
+
uv run if-split spec data/out/manifest.json --name "my-split" --author "you" \
|
|
277
|
+
--out my-split.ifsplit.yaml
|
|
278
|
+
|
|
279
|
+
# Anyone reproduces your split from just that file:
|
|
280
|
+
uv run if-split build --config my-split.ifsplit.yaml --out their/out
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
The emitted file carries a `spec:` header that announces what it is and pins the
|
|
284
|
+
expected hash:
|
|
285
|
+
|
|
286
|
+
```yaml
|
|
287
|
+
spec:
|
|
288
|
+
ifsplit_spec: ifsplit/config@1 # schema id — the file says what it is
|
|
289
|
+
name: my-split
|
|
290
|
+
author: you
|
|
291
|
+
created_with: if-split 0.1.0
|
|
292
|
+
expected_config_hash: 3b63318286fd2ac4994f34d10936be05
|
|
293
|
+
snapshot_date: '2026-05-31'
|
|
294
|
+
resolution_max_A: 3.5
|
|
295
|
+
# ... all output-affecting settings ...
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
On load, if `expected_config_hash` no longer matches the settings (someone edited
|
|
299
|
+
them after stamping), IF-Split warns. The `spec:` metadata is **excluded from the
|
|
300
|
+
hash**, so name/author/description never change the split identity — two specs that
|
|
301
|
+
differ only in their labels produce byte-identical outputs.
|
|
302
|
+
|
|
303
|
+
| Artifact | Question it answers | Size |
|
|
304
|
+
|---|---|--:|
|
|
305
|
+
| `*.ifsplit.yaml` (or `config.yaml`) | *"How did you make this split?"* — the recipe | ~KB |
|
|
306
|
+
| `manifest.json` | *"What's in it?"* — counts, provenance, file index | ~KB |
|
|
307
|
+
| `dataset.lock` | *"Reproduce the exact bytes"* — pins entry set + candidates SHA | ~MB |
|
|
308
|
+
|
|
239
309
|
## Configuration
|
|
240
310
|
|
|
241
311
|
Everything that affects the output lives in one YAML file
|
|
242
312
|
([`config/default.yaml`](config/default.yaml)); its canonical hash is embedded
|
|
243
|
-
in every manifest, so two builds with the same hash used identical settings.
|
|
313
|
+
in every manifest, so two builds with the same hash used identical settings. It
|
|
314
|
+
doubles as a shareable **split spec** — see [Sharing a split spec](#sharing-a-split-spec).
|
|
244
315
|
|
|
245
316
|
| Key | Default | Meaning |
|
|
246
317
|
|---|---|---|
|
|
@@ -251,8 +322,9 @@ in every manifest, so two builds with the same hash used identical settings.
|
|
|
251
322
|
| `excluded_het` | waters + common ions | Extra components forced to `artifact`. |
|
|
252
323
|
| `use_biological_assembly` | `true` | Count residues from assembly 1, not the deposited asymmetric unit. |
|
|
253
324
|
| `purification_metals` | `[NI, CO]` | Metals treated as IMAC purification artifacts (the ions a His-tag chelates); `[]` disables the heuristic. |
|
|
254
|
-
| `histag_min_run` | `6` | His-run length that marks a purification tag. |
|
|
255
|
-
| `
|
|
325
|
+
| `histag_min_run` | `6` | His-run length (anywhere) that marks a purification tag. |
|
|
326
|
+
| `histag_terminal_min_run` | `3` | Shorter His-run at a chain terminus that also counts as a tag (partial/unmodeled 6×His). |
|
|
327
|
+
| `exclude_purification_artifacts` | `true` | Demote His-tag metals to `artifact`; lone uncorroborated Ni/Co → `ambiguous`. |
|
|
256
328
|
| `identity_threshold` | `0.30` | Clustering cutoff (RCSB levels: 30/50/70/90/95/100). |
|
|
257
329
|
| `clustering_backend` | `precomputed` | `precomputed` (RCSB clusters) or `mmseqs2` (run your own). |
|
|
258
330
|
| `split_fractions` | 0.80 / 0.10 / 0.10 | train / val / test. |
|
|
@@ -14,11 +14,19 @@ use_biological_assembly: true # biounits, as in LigandMPNN (assembly 1)
|
|
|
14
14
|
# Curation: purification-artifact detection (Stage 4). A poly-His tag that
|
|
15
15
|
# coordinates Ni/Co is an artifact of affinity purification, not a biological
|
|
16
16
|
# metal site — a known blemish in the LigandMPNN metal set. An entry whose only
|
|
17
|
-
# metal is
|
|
18
|
-
#
|
|
19
|
-
#
|
|
17
|
+
# metal is a purification metal AND that carries a His-tag (a >= histag_min_run
|
|
18
|
+
# run anywhere, OR a >= histag_terminal_min_run run at a chain terminus — to
|
|
19
|
+
# catch 6xHis tags left partial by unmodeled/trimmed residues) is flagged as an
|
|
20
|
+
# artifact. Set purification_metals: [] to disable.
|
|
21
|
+
#
|
|
22
|
+
# Even without a detectable tag, a *lone* Ni/Co with no measured affinity is
|
|
23
|
+
# demoted from `functional` to `ambiguous` (reported, not labelled): ~96% of
|
|
24
|
+
# lone Ni/Co in the PDB have no His-tag in the deposited sequence, so a bare
|
|
25
|
+
# contact can't be trusted as a biological metal site. Real metals (Zn, Mg, Fe,
|
|
26
|
+
# Mn, …) and Ni/Co alongside another real metal or with affinity are unaffected.
|
|
20
27
|
purification_metals: ["NI", "CO"]
|
|
21
28
|
histag_min_run: 6
|
|
29
|
+
histag_terminal_min_run: 3
|
|
22
30
|
exclude_purification_artifacts: true
|
|
23
31
|
|
|
24
32
|
identity_threshold: 0.30 # clustering cutoff (Data API levels: 30/50/70/90/95/100)
|
|
@@ -33,7 +41,7 @@ seed: 0
|
|
|
33
41
|
# components (never individual entries -> no leakage) into test, in deterministic
|
|
34
42
|
# hash order, skipping registry-pinned components (growth stays stable). A floor
|
|
35
43
|
# larger than the available supply is met as far as possible; the shortfall is
|
|
36
|
-
# reported in the manifest, not forced. Example: {metal: 500, nucleotide: 200}
|
|
44
|
+
# reported in the manifest, not forced. Example: {metal: 500, nucleic_acid: 200}
|
|
37
45
|
test_min_per_class: {}
|
|
38
46
|
|
|
39
47
|
# Quality filters (Stage 3) — wwPDB validation-report metrics, fetched as
|
|
@@ -18,7 +18,7 @@ uv run if-split build --config examples/IF-Split-2026.05.31/config.yaml --out da
|
|
|
18
18
|
| [`test.json`](test.json) | 100 KB | test-set PDB ids (all of them) |
|
|
19
19
|
| [`test/metal_test.json`](test/metal_test.json) | 36 KB | test ids with a functional **metal** site |
|
|
20
20
|
| [`test/small_molecule_test.json`](test/small_molecule_test.json) | 28 KB | test ids with a functional **small molecule** |
|
|
21
|
-
| [`test/
|
|
21
|
+
| [`test/nucleic_acid_test.json`](test/nucleic_acid_test.json) | 8 KB | test ids that are protein↔**nucleic-acid** complexes (DNA/RNA chains) |
|
|
22
22
|
| [`manifest.json`](manifest.json) | 4 KB | provenance: config, counts, clustering stats, file index |
|
|
23
23
|
| [`config.yaml`](config.yaml) | — | the exact config used (= `config/default.yaml`, cutoff pinned) |
|
|
24
24
|
| [`STATS.txt`](STATS.txt) | — | `if-split stats` output |
|
|
@@ -50,16 +50,16 @@ output).
|
|
|
50
50
|
|
|
51
51
|
## Headline numbers
|
|
52
52
|
|
|
53
|
-
- **223,
|
|
54
|
-
- **214,
|
|
55
|
-
- **34,222** raw RCSB sequence clusters @ 30% identity → **19,
|
|
56
|
-
components** after union-find merged **38,
|
|
53
|
+
- **223,419** candidates (X-ray + EM, ≤ 3.5 Å, released ≤ 2026-05-31)
|
|
54
|
+
- **214,791 kept** — dropped 3,078 no-protein, 5,550 over-size (≥ 6000 residues)
|
|
55
|
+
- **34,222** raw RCSB sequence clusters @ 30% identity → **19,589 leakage-safe
|
|
56
|
+
components** after union-find merged **38,832** multi-chain bridging entries
|
|
57
57
|
|
|
58
58
|
| Split | Entries | Components | Component % |
|
|
59
59
|
|---|--:|--:|--:|
|
|
60
|
-
| train | 188,
|
|
61
|
-
| val | 13,
|
|
62
|
-
| test | 12,
|
|
60
|
+
| train | 188,662 | 15,615 | 79.7% |
|
|
61
|
+
| val | 13,720 | 1,992 | 10.2% |
|
|
62
|
+
| test | 12,409 | 1,982 | 10.1% |
|
|
63
63
|
|
|
64
64
|
**The split is balanced on sequence *components*, not entry counts** — that's why
|
|
65
65
|
train holds ~88% of entries (redundant families like lysozyme carry many entries
|
|
@@ -68,13 +68,17 @@ per component). Splitting on components is what prevents cross-split leakage; us
|
|
|
68
68
|
|
|
69
69
|
## Curation highlights (holo-gated, annotate-never-destroy)
|
|
70
70
|
|
|
71
|
-
- **Test set, functional tier:** metal 4,
|
|
72
|
-
- **Test set, ambiguous (reported, not labelled):** small-molecule
|
|
73
|
-
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
-
|
|
77
|
-
|
|
71
|
+
- **Test set, functional tier:** metal 4,012 · small-molecule 6,713 · nucleic-acid 586
|
|
72
|
+
- **Test set, ambiguous (reported, not labelled):** small-molecule 419 · metal 166 · nucleic-acid 1
|
|
73
|
+
- Functional small molecules far exceed ambiguous (6,713 vs 419): the
|
|
74
|
+
`is_subject_of_investigation` gate recovers non-covalently bound cofactors
|
|
75
|
+
(FAD/NAD/FMN/NADP) and inhibitors that the bond-based contact field misses.
|
|
76
|
+
- The metal ambiguous count includes lone, uncorroborated Ni/Co (likely IMAC
|
|
77
|
+
artifacts whose His-tag is absent from the deposited sequence) — demoted from
|
|
78
|
+
functional, not dropped.
|
|
79
|
+
- **415** His-tag/Ni(Co) purification artifacts flagged and demoted from the metal
|
|
80
|
+
class — the LigandMPNN metal-set blemish, caught automatically (full His run or
|
|
81
|
+
a partial terminal tag).
|
|
78
82
|
|
|
79
83
|
Every structure stays in its split regardless of ligand quality; only the labels
|
|
80
84
|
and confidence tiers change.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
IF-Split-2026.05.31 (config 3b63318286fd2ac4994f34d10936be05)
|
|
2
|
+
candidates: 223419 kept: 214791 dropped: 8628
|
|
3
|
+
- no_protein_entity: 3078
|
|
4
|
+
- too_large: 5550
|
|
5
|
+
clustering: precomputed @ 30% components=19589 (from 34222 raw) multichain=38832
|
|
6
|
+
splits (entries / components):
|
|
7
|
+
train: 188662 entries 15615 components
|
|
8
|
+
val : 13720 entries 1992 components
|
|
9
|
+
test : 12409 entries 1982 components
|
|
10
|
+
test set by ligand class (functional tier):
|
|
11
|
+
metal: 4012
|
|
12
|
+
nucleic_acid: 586
|
|
13
|
+
small_molecule: 6713
|
|
14
|
+
test set ambiguous (reported, not labelled):
|
|
15
|
+
metal: 166
|
|
16
|
+
nucleic_acid: 1
|
|
17
|
+
small_molecule: 419
|
|
18
|
+
His-tag/Ni purification artifacts flagged: 415
|
|
19
|
+
split files: test.json, train.json, val.json
|
|
20
|
+
per-class test files: test/metal_test.json, test/nucleic_acid_test.json, test/small_molecule_test.json
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"candidates": {
|
|
3
|
-
"count":
|
|
4
|
-
"sha256": "
|
|
3
|
+
"count": 223419,
|
|
4
|
+
"sha256": "302da149ddc0bc8ef62e92fe8f249146f9f37119fca7ea4ab5fbe26f3c6b15d7"
|
|
5
5
|
},
|
|
6
6
|
"clustering": {
|
|
7
7
|
"backend": "precomputed",
|
|
8
8
|
"identity": 30,
|
|
9
|
-
"multichain_entries":
|
|
10
|
-
"n_clusters":
|
|
9
|
+
"multichain_entries": 38832,
|
|
10
|
+
"n_clusters": 19589,
|
|
11
11
|
"n_raw_clusters": 34222,
|
|
12
12
|
"unclustered_entries": 316
|
|
13
13
|
},
|
|
@@ -26,6 +26,7 @@
|
|
|
26
26
|
"ELECTRON MICROSCOPY"
|
|
27
27
|
],
|
|
28
28
|
"histag_min_run": 6,
|
|
29
|
+
"histag_terminal_min_run": 3,
|
|
29
30
|
"identity_threshold": 0.3,
|
|
30
31
|
"ligand_context_radius_A": 8.0,
|
|
31
32
|
"max_clashscore": null,
|
|
@@ -49,9 +50,10 @@
|
|
|
49
50
|
"val": 0.1
|
|
50
51
|
},
|
|
51
52
|
"split_salt": "snapsplit-v1",
|
|
53
|
+
"test_min_per_class": {},
|
|
52
54
|
"use_biological_assembly": true
|
|
53
55
|
},
|
|
54
|
-
"config_hash": "
|
|
56
|
+
"config_hash": "3b63318286fd2ac4994f34d10936be05",
|
|
55
57
|
"dataset_version": "IF-Split-2026.05.31",
|
|
56
58
|
"files": {
|
|
57
59
|
"clusters": "clusters.json",
|
|
@@ -64,7 +66,7 @@
|
|
|
64
66
|
},
|
|
65
67
|
"test_by_class": {
|
|
66
68
|
"metal": "test/metal_test.json",
|
|
67
|
-
"nucleotide": "test/nucleotide_test.json",
|
|
69
|
+
"nucleic_acid": "test/nucleic_acid_test.json",
|
|
68
70
|
"small_molecule": "test/small_molecule_test.json"
|
|
69
71
|
}
|
|
70
72
|
},
|
|
@@ -74,56 +76,57 @@
|
|
|
74
76
|
"too_large": 5550
|
|
75
77
|
},
|
|
76
78
|
"dropped": 8628,
|
|
77
|
-
"kept":
|
|
79
|
+
"kept": 214791
|
|
78
80
|
},
|
|
79
81
|
"if_split_version": "0.1.0",
|
|
80
82
|
"ligands": {
|
|
81
|
-
"n_purification_artifacts":
|
|
83
|
+
"n_purification_artifacts": 415
|
|
82
84
|
},
|
|
83
85
|
"manifest_schema": "if-split/manifest@2",
|
|
84
86
|
"splits": {
|
|
85
87
|
"cluster_counts": {
|
|
86
|
-
"test":
|
|
87
|
-
"train":
|
|
88
|
-
"val":
|
|
88
|
+
"test": 1982,
|
|
89
|
+
"train": 15615,
|
|
90
|
+
"val": 1992
|
|
89
91
|
},
|
|
90
92
|
"entry_counts": {
|
|
91
|
-
"test":
|
|
92
|
-
"train":
|
|
93
|
-
"val":
|
|
93
|
+
"test": 12409,
|
|
94
|
+
"train": 188662,
|
|
95
|
+
"val": 13720
|
|
94
96
|
},
|
|
95
97
|
"per_split_ambiguous_counts": {
|
|
96
98
|
"test": {
|
|
97
|
-
"metal":
|
|
98
|
-
"
|
|
99
|
-
"small_molecule":
|
|
99
|
+
"metal": 166,
|
|
100
|
+
"nucleic_acid": 1,
|
|
101
|
+
"small_molecule": 419
|
|
100
102
|
},
|
|
101
103
|
"train": {
|
|
102
|
-
"metal":
|
|
103
|
-
"
|
|
104
|
-
"small_molecule":
|
|
104
|
+
"metal": 2061,
|
|
105
|
+
"nucleic_acid": 11,
|
|
106
|
+
"small_molecule": 6144
|
|
105
107
|
},
|
|
106
108
|
"val": {
|
|
107
|
-
"metal":
|
|
108
|
-
"small_molecule":
|
|
109
|
+
"metal": 562,
|
|
110
|
+
"small_molecule": 514
|
|
109
111
|
}
|
|
110
112
|
},
|
|
111
113
|
"per_split_class_counts": {
|
|
112
114
|
"test": {
|
|
113
|
-
"metal":
|
|
114
|
-
"
|
|
115
|
-
"small_molecule":
|
|
115
|
+
"metal": 4012,
|
|
116
|
+
"nucleic_acid": 586,
|
|
117
|
+
"small_molecule": 6713
|
|
116
118
|
},
|
|
117
119
|
"train": {
|
|
118
|
-
"metal":
|
|
119
|
-
"
|
|
120
|
-
"small_molecule":
|
|
120
|
+
"metal": 55113,
|
|
121
|
+
"nucleic_acid": 9948,
|
|
122
|
+
"small_molecule": 102762
|
|
121
123
|
},
|
|
122
124
|
"val": {
|
|
123
|
-
"metal":
|
|
124
|
-
"
|
|
125
|
-
"small_molecule":
|
|
125
|
+
"metal": 4019,
|
|
126
|
+
"nucleic_acid": 539,
|
|
127
|
+
"small_molecule": 6692
|
|
126
128
|
}
|
|
127
|
-
}
|
|
129
|
+
},
|
|
130
|
+
"test_minimum_shortfalls": {}
|
|
128
131
|
}
|
|
129
132
|
}
|